Merge branch 'master' of https://github.com/derekcoding1/StartupStrategy

SafeguardLi · SafeguardLi · commit 470d3a2cfbc5 · 2020-11-17T21:03:39.000-05:00
diff --git a/nlp/embeddings.ipynb b/nlp/embeddings.ipynb
@@ -39,7 +39,11 @@
     {
       "cell_type": "code",
       "metadata": {
-        "id": "_m-Kb02irFcO"
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "_m-Kb02irFcO",
+        "outputId": "56c60d63-a268-45e2-f464-6f607bb33600"
       },
       "source": [
         "import pandas as pd\n",
@@ -62,8 +66,17 @@
         "from nltk.corpus import stopwords\n",
         "import gcld3"
       ],
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 25,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+            "[nltk_data]   Package stopwords is already up-to-date!\n"
+          ],
+          "name": "stdout"
+        }
+      ]
     },
     {
       "cell_type": "code",
@@ -130,7 +143,7 @@
         "id": "lDVL8OMHLldP"
       },
       "source": [
-        "Txt_all_frame = pd.read_csv(\"drive/My Drive/Capstone Shared Docs/result/complete_df/year_2012.csv\")"
+        "Txt_all_frame = pd.read_csv(\"drive/My Drive/Capstone Shared Docs/result/complete_df/year_2013.csv\")"
       ],
       "execution_count": 10,
       "outputs": []
@@ -146,6 +159,17 @@
       "execution_count": 12,
       "outputs": []
     },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9_qpNEX5WOkk"
+      },
+      "source": [
+        "Txt_all_frame.head()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
     {
       "cell_type": "code",
       "metadata": {
@@ -356,16 +380,17 @@
         "%%time\n",
         "docs_train = []\n",
         "\n",
-        "pct = int(np.percentile(Txt_all_frame_en.Text_len, 95))\n",
         "\n",
-        "for i in range(len(Complete_frame)):\n",
+        "pct = int(np.percentile(Txt_all_frame_en.text_len, 95))\n",
+        "\n",
+        "for i in range(len(Txt_all_frame_en)):\n",
         "\n",
-        "  if len(Complete_frame['text_cleaned'][i]) > pct :\n",
-        "    v = sbert_model.encode(Complete_frame['text_cleaned'][i][:pct])\n",
+        "  if len(Txt_all_frame_en['text_cleaned'][i]) > pct :\n",
+        "    v = sbert_model.encode(Txt_all_frame_en['text_cleaned'][i][:pct])\n",
         "    docs_train.append(v)\n",
         "  \n",
         "  else:\n",
-        "    v = sbert_model.encode(Complete_frame['text_cleaned'][i])\n",
+        "    v = sbert_model.encode(Txt_all_frame_en['text_cleaned'][i])\n",
         "    docs_train.append(v)\n",
         "X_train = np.vstack([d.T for d in docs_train])"
       ],
@@ -574,7 +599,7 @@
         "id": "9n-hH42rOCjc"
       },
       "source": [
-        "startups.to_csv(\"drive/My Drive/Capstone Shared Docs/strategy score/_year.csv\",index=False)"
+        "startups.to_csv(\"drive/My Drive/Capstone Shared Docs/strategy score/word2_vec_2013.csv\",index=False)"
       ],
       "execution_count": null,
       "outputs": []