From 7a734ca5427a838f46bd1ad79f20b699e1efbedb Mon Sep 17 00:00:00 2001 From: Nilesh Sakpal Date: Sat, 1 May 2021 20:35:49 +0200 Subject: [PATCH 1/2] Created using Colaboratory --- Patent_Data_POS_Tagging_1.ipynb | 374 ++++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 Patent_Data_POS_Tagging_1.ipynb diff --git a/Patent_Data_POS_Tagging_1.ipynb b/Patent_Data_POS_Tagging_1.ipynb new file mode 100644 index 0000000..0cc4eda --- /dev/null +++ b/Patent_Data_POS_Tagging_1.ipynb @@ -0,0 +1,374 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Patent_Data-POS_Tagging-1.ipynb", + "provenance": [], + "authorship_tag": "ABX9TyNmnJcNaUjOMOB3fBXJsmmn", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tFi-oWCW-yUs" + }, + "source": [ + "First step is to import Pandas which will help us get our data in the program. " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "1IY2T1Ak1h8I" + }, + "source": [ + "import pandas as pd" + ], + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5h4wWvsS-_nU" + }, + "source": [ + "Second step is to import the file where we have the dataset. After running the next command, you will get an option of selecting the file needed" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "H9beN2vm-7Fo" + }, + "source": [ + "from google.colab import files\n", + "uploaded = files.upload()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PFBVeo-x_Lrm" + }, + "source": [ + "Once the file is uploaded above, the next command specifies the file name and converts the dataset into a pandas Dataframe, which can be manipulated by the program" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AWiLDB3p_DTF" + }, + "source": [ + "import io\n", + "df2 = pd.read_excel(io.BytesIO(uploaded['Set1.xls']))" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vJ9PAhzY-nil" + }, + "source": [ + "The dataset will be saved as a Pandas Dataframe. In the next line of code, we can print this dataframe to preview it and see if it is in order." + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bdxYMt18ApIp", + "outputId": "64b2a91a-77f0-4aa8-f53b-8de5cd57c541" + }, + "source": [ + "print(df2)" + ], + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "text": [ + " # ... Abstract\n", + "0 1 ... An electric system is disclosed. The electric ...\n", + "1 2 ... The present disclosure discloses a charging de...\n", + "2 3 ... A charge/discharge control method for a batter...\n", + "3 4 ... An uninterruptible power supply includes a swi...\n", + "4 5 ... A system and method for hierarchical arc fault...\n", + "5 6 ... Methods and apparatus for controlling charge c...\n", + "6 7 ... A circuit provides for regulating charge and d...\n", + "7 8 ... A power bank has a station and a plurality of ...\n", + "8 9 ... A device and method are provided for saving po...\n", + "9 10 ... A battery system for a vehicle includes: a bat...\n", + "10 11 ... A cell for an electrical energy store is provi...\n", + "11 12 ... Provided is a nonaqueous electrolyte battery s...\n", + "12 13 ... A battery case includes a first cell accommoda...\n", + "13 14 ... An electrochemical pouch cell includes a pouch...\n", + "14 15 ... The present invention relates to a rechargeabl...\n", + "15 16 ... Disclosed is a method for manufacturing a batt...\n", + "16 17 ... An energy storage system for a motor vehicle i...\n", + "17 18 ... A thermal interface member may comprise a subs...\n", + "18 19 ... An energy storage container and a heat dissipa...\n", + "19 20 ... The present disclosure provides a battery heat...\n", + "20 21 ... A method for operating a battery of an at leas...\n", + "21 22 ... Provided are temperature monitoring apparatus ...\n", + "22 23 ... The invention relates to a method for operatin...\n", + "23 24 ... A system for supplying power to a portable bat...\n", + "24 25 ... A vehicle includes a traction battery having c...\n", + "25 26 ... An electrochemical cell has a flexible low-pro...\n", + "26 27 ... A control module is arranged side by side with...\n", + "27 28 ... A battery pack (10) is a battery pack that is ...\n", + "28 29 ... A solid state battery (10) including a stack o...\n", + "29 30 ... A battery includes first and second power gene...\n", + "\n", + "[30 rows x 4 columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UYXLuZYFKs5B" + }, + "source": [ + "Notice that the left side gives numbers for rows and the top columns have the same names as the excel file. The first row is 0 and the last row is 29 making it 30 rows in total and 4 columns. We can select a single cell by specifying its row and column. Lets select the cell on the 10th row (number 9) under the column Abstract" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 108 + }, + "id": "MOjo7BpNGBRX", + "outputId": "c1ef98e0-fe60-4aad-c219-1c8e183c1c7b" + }, + "source": [ + "df2.iloc[9]['Abstract']\n" + ], + "execution_count": 27, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'A battery system for a\\xa0vehicle\\xa0includes: a battery module including a plurality of secondary battery cells; a gas sensor; and a housing accommodating the battery module and the gas sensor. At least a portion of an exterior surface of the battery module and/or at least a portion of an interior surface of the housing is covered by a coating. The coating is configured to emit a gaseous species when a temperature exceeds a reference temperature, and the gas sensor is configured to detect the gaseous species.'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 27 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tjTlzOqQLg3W" + }, + "source": [ + "Now for the final step, we will break this text from the Abstract into noun phrases using Textblob.\n", + "\n", + "For this, we have to import Textblob and NLTK" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "d2VWDBE9Vgj4" + }, + "source": [ + "from textblob import TextBlob\n", + "import nltk\n", + "nltk.download('brown')\n", + "nltk.download('punkt')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HRHQMbOHL2n8" + }, + "source": [ + "We give the cell value above as input to Textblob Noun phrase extractor to get noun phrases from its text" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YumB9PObVj-z", + "outputId": "9d782604-0348-4ec0-ec9d-ffea8102ea00" + }, + "source": [ + "TextBlob.np_extractor.extract(df2.iloc[9]['Abstract'])" + ], + "execution_count": 28, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['battery system',\n", + " 'battery module',\n", + " 'secondary battery cells',\n", + " 'gas sensor',\n", + " 'battery module',\n", + " 'gas sensor',\n", + " 'exterior surface',\n", + " 'battery module and/or',\n", + " 'interior surface',\n", + " 'gaseous species',\n", + " 'temperature exceeds',\n", + " 'reference temperature',\n", + " 'gas sensor',\n", + " 'gaseous species']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 28 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5B7QNuklMOpu" + }, + "source": [ + "Notice that some phrases are repeated. This duplication can either be removed or even used to count the number of times a phrase appears according to the end use planned for them. " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "lp6WtReMVtVH" + }, + "source": [ + "b.noun_phrases" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nZtl4wiHMhdx" + }, + "source": [ + "Let's get these phrases for all the abstracts in our list. For this we could loop this code using a For Loop, and enter the phrases output in a python variable called dictionary. \n", + "\n", + "We name this dictionary phrasecollect" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Gh12gL9PXrui" + }, + "source": [ + "phrasescollect = dict()" + ], + "execution_count": 29, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tqvsY-vBW3XQ" + }, + "source": [ + "for i in range(0, 30):\n", + " phrasescollect[i] = TextBlob.np_extractor.extract(df2.iloc[i]['Abstract'])\n", + " phrasescollect[i] = list(dict.fromkeys(phrasescollect[i]))" + ], + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OI679kIcNAoZ" + }, + "source": [ + "For this demo, I have added a line of code to remove the duplicate phrases. This is the second line phrasescollect[i] = list(dict.fromkeys(phrasescollect[i]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oOq9EG-ONOyi" + }, + "source": [ + "Now that we have all the phrases, lets print the dictionary to see the output" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gQumCGB9YCyO", + "outputId": "edc1acb5-d748-4e75-d788-4e02d129606e" + }, + "source": [ + "phrasescollect" + ], + "execution_count": 31, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{}" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 31 + } + ] + } + ] +} \ No newline at end of file From 5f9cd4f1960aa6a8a18e330071a14bac900daeb0 Mon Sep 17 00:00:00 2001 From: Nilesh Sakpal Date: Sat, 1 May 2021 20:36:23 +0200 Subject: [PATCH 2/2] Delete Patent_Data_POS_Tagging_1.ipynb --- Patent_Data_POS_Tagging_1.ipynb | 374 -------------------------------- 1 file changed, 374 deletions(-) delete mode 100644 Patent_Data_POS_Tagging_1.ipynb diff --git a/Patent_Data_POS_Tagging_1.ipynb b/Patent_Data_POS_Tagging_1.ipynb deleted file mode 100644 index 0cc4eda..0000000 --- a/Patent_Data_POS_Tagging_1.ipynb +++ /dev/null @@ -1,374 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Patent_Data-POS_Tagging-1.ipynb", - "provenance": [], - "authorship_tag": "ABX9TyNmnJcNaUjOMOB3fBXJsmmn", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tFi-oWCW-yUs" - }, - "source": [ - "First step is to import Pandas which will help us get our data in the program. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1IY2T1Ak1h8I" - }, - "source": [ - "import pandas as pd" - ], - "execution_count": 1, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5h4wWvsS-_nU" - }, - "source": [ - "Second step is to import the file where we have the dataset. After running the next command, you will get an option of selecting the file needed" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "H9beN2vm-7Fo" - }, - "source": [ - "from google.colab import files\n", - "uploaded = files.upload()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PFBVeo-x_Lrm" - }, - "source": [ - "Once the file is uploaded above, the next command specifies the file name and converts the dataset into a pandas Dataframe, which can be manipulated by the program" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AWiLDB3p_DTF" - }, - "source": [ - "import io\n", - "df2 = pd.read_excel(io.BytesIO(uploaded['Set1.xls']))" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vJ9PAhzY-nil" - }, - "source": [ - "The dataset will be saved as a Pandas Dataframe. In the next line of code, we can print this dataframe to preview it and see if it is in order." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bdxYMt18ApIp", - "outputId": "64b2a91a-77f0-4aa8-f53b-8de5cd57c541" - }, - "source": [ - "print(df2)" - ], - "execution_count": 26, - "outputs": [ - { - "output_type": "stream", - "text": [ - " # ... Abstract\n", - "0 1 ... An electric system is disclosed. The electric ...\n", - "1 2 ... The present disclosure discloses a charging de...\n", - "2 3 ... A charge/discharge control method for a batter...\n", - "3 4 ... An uninterruptible power supply includes a swi...\n", - "4 5 ... A system and method for hierarchical arc fault...\n", - "5 6 ... Methods and apparatus for controlling charge c...\n", - "6 7 ... A circuit provides for regulating charge and d...\n", - "7 8 ... A power bank has a station and a plurality of ...\n", - "8 9 ... A device and method are provided for saving po...\n", - "9 10 ... A battery system for a vehicle includes: a bat...\n", - "10 11 ... A cell for an electrical energy store is provi...\n", - "11 12 ... Provided is a nonaqueous electrolyte battery s...\n", - "12 13 ... A battery case includes a first cell accommoda...\n", - "13 14 ... An electrochemical pouch cell includes a pouch...\n", - "14 15 ... The present invention relates to a rechargeabl...\n", - "15 16 ... Disclosed is a method for manufacturing a batt...\n", - "16 17 ... An energy storage system for a motor vehicle i...\n", - "17 18 ... A thermal interface member may comprise a subs...\n", - "18 19 ... An energy storage container and a heat dissipa...\n", - "19 20 ... The present disclosure provides a battery heat...\n", - "20 21 ... A method for operating a battery of an at leas...\n", - "21 22 ... Provided are temperature monitoring apparatus ...\n", - "22 23 ... The invention relates to a method for operatin...\n", - "23 24 ... A system for supplying power to a portable bat...\n", - "24 25 ... A vehicle includes a traction battery having c...\n", - "25 26 ... An electrochemical cell has a flexible low-pro...\n", - "26 27 ... A control module is arranged side by side with...\n", - "27 28 ... A battery pack (10) is a battery pack that is ...\n", - "28 29 ... A solid state battery (10) including a stack o...\n", - "29 30 ... A battery includes first and second power gene...\n", - "\n", - "[30 rows x 4 columns]\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UYXLuZYFKs5B" - }, - "source": [ - "Notice that the left side gives numbers for rows and the top columns have the same names as the excel file. The first row is 0 and the last row is 29 making it 30 rows in total and 4 columns. We can select a single cell by specifying its row and column. Lets select the cell on the 10th row (number 9) under the column Abstract" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 108 - }, - "id": "MOjo7BpNGBRX", - "outputId": "c1ef98e0-fe60-4aad-c219-1c8e183c1c7b" - }, - "source": [ - "df2.iloc[9]['Abstract']\n" - ], - "execution_count": 27, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'A battery system for a\\xa0vehicle\\xa0includes: a battery module including a plurality of secondary battery cells; a gas sensor; and a housing accommodating the battery module and the gas sensor. At least a portion of an exterior surface of the battery module and/or at least a portion of an interior surface of the housing is covered by a coating. The coating is configured to emit a gaseous species when a temperature exceeds a reference temperature, and the gas sensor is configured to detect the gaseous species.'" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 27 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tjTlzOqQLg3W" - }, - "source": [ - "Now for the final step, we will break this text from the Abstract into noun phrases using Textblob.\n", - "\n", - "For this, we have to import Textblob and NLTK" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "d2VWDBE9Vgj4" - }, - "source": [ - "from textblob import TextBlob\n", - "import nltk\n", - "nltk.download('brown')\n", - "nltk.download('punkt')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HRHQMbOHL2n8" - }, - "source": [ - "We give the cell value above as input to Textblob Noun phrase extractor to get noun phrases from its text" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "YumB9PObVj-z", - "outputId": "9d782604-0348-4ec0-ec9d-ffea8102ea00" - }, - "source": [ - "TextBlob.np_extractor.extract(df2.iloc[9]['Abstract'])" - ], - "execution_count": 28, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['battery system',\n", - " 'battery module',\n", - " 'secondary battery cells',\n", - " 'gas sensor',\n", - " 'battery module',\n", - " 'gas sensor',\n", - " 'exterior surface',\n", - " 'battery module and/or',\n", - " 'interior surface',\n", - " 'gaseous species',\n", - " 'temperature exceeds',\n", - " 'reference temperature',\n", - " 'gas sensor',\n", - " 'gaseous species']" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 28 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5B7QNuklMOpu" - }, - "source": [ - "Notice that some phrases are repeated. This duplication can either be removed or even used to count the number of times a phrase appears according to the end use planned for them. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lp6WtReMVtVH" - }, - "source": [ - "b.noun_phrases" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nZtl4wiHMhdx" - }, - "source": [ - "Let's get these phrases for all the abstracts in our list. For this we could loop this code using a For Loop, and enter the phrases output in a python variable called dictionary. \n", - "\n", - "We name this dictionary phrasecollect" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Gh12gL9PXrui" - }, - "source": [ - "phrasescollect = dict()" - ], - "execution_count": 29, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "tqvsY-vBW3XQ" - }, - "source": [ - "for i in range(0, 30):\n", - " phrasescollect[i] = TextBlob.np_extractor.extract(df2.iloc[i]['Abstract'])\n", - " phrasescollect[i] = list(dict.fromkeys(phrasescollect[i]))" - ], - "execution_count": 25, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OI679kIcNAoZ" - }, - "source": [ - "For this demo, I have added a line of code to remove the duplicate phrases. This is the second line phrasescollect[i] = list(dict.fromkeys(phrasescollect[i]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oOq9EG-ONOyi" - }, - "source": [ - "Now that we have all the phrases, lets print the dictionary to see the output" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gQumCGB9YCyO", - "outputId": "edc1acb5-d748-4e75-d788-4e02d129606e" - }, - "source": [ - "phrasescollect" - ], - "execution_count": 31, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{}" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 31 - } - ] - } - ] -} \ No newline at end of file