|
39 | 39 | { |
40 | 40 | "cell_type": "code", |
41 | 41 | "metadata": { |
42 | | - "id": "_m-Kb02irFcO" |
| 42 | + "colab": { |
| 43 | + "base_uri": "https://localhost:8080/" |
| 44 | + }, |
| 45 | + "id": "_m-Kb02irFcO", |
| 46 | + "outputId": "56c60d63-a268-45e2-f464-6f607bb33600" |
43 | 47 | }, |
44 | 48 | "source": [ |
45 | 49 | "import pandas as pd\n", |
|
62 | 66 | "from nltk.corpus import stopwords\n", |
63 | 67 | "import gcld3" |
64 | 68 | ], |
65 | | - "execution_count": null, |
66 | | - "outputs": [] |
| 69 | + "execution_count": 25, |
| 70 | + "outputs": [ |
| 71 | + { |
| 72 | + "output_type": "stream", |
| 73 | + "text": [ |
| 74 | + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", |
| 75 | + "[nltk_data] Package stopwords is already up-to-date!\n" |
| 76 | + ], |
| 77 | + "name": "stdout" |
| 78 | + } |
| 79 | + ] |
67 | 80 | }, |
68 | 81 | { |
69 | 82 | "cell_type": "code", |
|
130 | 143 | "id": "lDVL8OMHLldP" |
131 | 144 | }, |
132 | 145 | "source": [ |
133 | | - "Txt_all_frame = pd.read_csv(\"drive/My Drive/Capstone Shared Docs/result/complete_df/year_2012.csv\")" |
| 146 | + "Txt_all_frame = pd.read_csv(\"drive/My Drive/Capstone Shared Docs/result/complete_df/year_2013.csv\")" |
134 | 147 | ], |
135 | 148 | "execution_count": 10, |
136 | 149 | "outputs": [] |
|
146 | 159 | "execution_count": 12, |
147 | 160 | "outputs": [] |
148 | 161 | }, |
| 162 | + { |
| 163 | + "cell_type": "code", |
| 164 | + "metadata": { |
| 165 | + "id": "9_qpNEX5WOkk" |
| 166 | + }, |
| 167 | + "source": [ |
| 168 | + "Txt_all_frame.head()" |
| 169 | + ], |
| 170 | + "execution_count": null, |
| 171 | + "outputs": [] |
| 172 | + }, |
149 | 173 | { |
150 | 174 | "cell_type": "code", |
151 | 175 | "metadata": { |
|
356 | 380 | "%%time\n", |
357 | 381 | "docs_train = []\n", |
358 | 382 | "\n", |
359 | | - "pct = int(np.percentile(Txt_all_frame_en.Text_len, 95))\n", |
360 | 383 | "\n", |
361 | | - "for i in range(len(Complete_frame)):\n", |
| 384 | + "pct = int(np.percentile(Txt_all_frame_en.text_len, 95))\n", |
| 385 | + "\n", |
| 386 | + "for i in range(len(Txt_all_frame_en)):\n", |
362 | 387 | "\n", |
363 | | - " if len(Complete_frame['text_cleaned'][i]) > pct :\n", |
364 | | - " v = sbert_model.encode(Complete_frame['text_cleaned'][i][:pct])\n", |
| 388 | + " if len(Txt_all_frame_en['text_cleaned'][i]) > pct :\n", |
| 389 | + " v = sbert_model.encode(Txt_all_frame_en['text_cleaned'][i][:pct])\n", |
365 | 390 | " docs_train.append(v)\n", |
366 | 391 | " \n", |
367 | 392 | " else:\n", |
368 | | - " v = sbert_model.encode(Complete_frame['text_cleaned'][i])\n", |
| 393 | + " v = sbert_model.encode(Txt_all_frame_en['text_cleaned'][i])\n", |
369 | 394 | " docs_train.append(v)\n", |
370 | 395 | "X_train = np.vstack([d.T for d in docs_train])" |
371 | 396 | ], |
|
574 | 599 | "id": "9n-hH42rOCjc" |
575 | 600 | }, |
576 | 601 | "source": [ |
577 | | - "startups.to_csv(\"drive/My Drive/Capstone Shared Docs/strategy score/_year.csv\",index=False)" |
| 602 | + "startups.to_csv(\"drive/My Drive/Capstone Shared Docs/strategy score/word2_vec_2013.csv\",index=False)" |
578 | 603 | ], |
579 | 604 | "execution_count": null, |
580 | 605 | "outputs": [] |
|
0 commit comments