Skip to content

Commit 470d3a2

Browse files
committed
2 parents 12b582c + 0fade59 commit 470d3a2

File tree

1 file changed

+35
-10
lines changed

1 file changed

+35
-10
lines changed

nlp/embeddings.ipynb

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,11 @@
3939
{
4040
"cell_type": "code",
4141
"metadata": {
42-
"id": "_m-Kb02irFcO"
42+
"colab": {
43+
"base_uri": "https://localhost:8080/"
44+
},
45+
"id": "_m-Kb02irFcO",
46+
"outputId": "56c60d63-a268-45e2-f464-6f607bb33600"
4347
},
4448
"source": [
4549
"import pandas as pd\n",
@@ -62,8 +66,17 @@
6266
"from nltk.corpus import stopwords\n",
6367
"import gcld3"
6468
],
65-
"execution_count": null,
66-
"outputs": []
69+
"execution_count": 25,
70+
"outputs": [
71+
{
72+
"output_type": "stream",
73+
"text": [
74+
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
75+
"[nltk_data] Package stopwords is already up-to-date!\n"
76+
],
77+
"name": "stdout"
78+
}
79+
]
6780
},
6881
{
6982
"cell_type": "code",
@@ -130,7 +143,7 @@
130143
"id": "lDVL8OMHLldP"
131144
},
132145
"source": [
133-
"Txt_all_frame = pd.read_csv(\"drive/My Drive/Capstone Shared Docs/result/complete_df/year_2012.csv\")"
146+
"Txt_all_frame = pd.read_csv(\"drive/My Drive/Capstone Shared Docs/result/complete_df/year_2013.csv\")"
134147
],
135148
"execution_count": 10,
136149
"outputs": []
@@ -146,6 +159,17 @@
146159
"execution_count": 12,
147160
"outputs": []
148161
},
162+
{
163+
"cell_type": "code",
164+
"metadata": {
165+
"id": "9_qpNEX5WOkk"
166+
},
167+
"source": [
168+
"Txt_all_frame.head()"
169+
],
170+
"execution_count": null,
171+
"outputs": []
172+
},
149173
{
150174
"cell_type": "code",
151175
"metadata": {
@@ -356,16 +380,17 @@
356380
"%%time\n",
357381
"docs_train = []\n",
358382
"\n",
359-
"pct = int(np.percentile(Txt_all_frame_en.Text_len, 95))\n",
360383
"\n",
361-
"for i in range(len(Complete_frame)):\n",
384+
"pct = int(np.percentile(Txt_all_frame_en.text_len, 95))\n",
385+
"\n",
386+
"for i in range(len(Txt_all_frame_en)):\n",
362387
"\n",
363-
" if len(Complete_frame['text_cleaned'][i]) > pct :\n",
364-
" v = sbert_model.encode(Complete_frame['text_cleaned'][i][:pct])\n",
388+
" if len(Txt_all_frame_en['text_cleaned'][i]) > pct :\n",
389+
" v = sbert_model.encode(Txt_all_frame_en['text_cleaned'][i][:pct])\n",
365390
" docs_train.append(v)\n",
366391
" \n",
367392
" else:\n",
368-
" v = sbert_model.encode(Complete_frame['text_cleaned'][i])\n",
393+
" v = sbert_model.encode(Txt_all_frame_en['text_cleaned'][i])\n",
369394
" docs_train.append(v)\n",
370395
"X_train = np.vstack([d.T for d in docs_train])"
371396
],
@@ -574,7 +599,7 @@
574599
"id": "9n-hH42rOCjc"
575600
},
576601
"source": [
577-
"startups.to_csv(\"drive/My Drive/Capstone Shared Docs/strategy score/_year.csv\",index=False)"
602+
"startups.to_csv(\"drive/My Drive/Capstone Shared Docs/strategy score/word2_vec_2013.csv\",index=False)"
578603
],
579604
"execution_count": null,
580605
"outputs": []

0 commit comments

Comments
 (0)