{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "a8be6839", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import janitor\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from matplotlib.ticker import MaxNLocator\n", "import math\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 4, "id": "211ba466", "metadata": {}, "outputs": [], "source": [ "outdir=\"WESTERN_CH_scope\"\n", "\n", "appln = pd.read_csv(f\"{outdir}/tls_201_scope.csv\")\n", "\n", "appln_title = pd.read_csv(f\"{outdir}/tls_202_scope.csv\")\n", "\n", "pers = pd.read_csv(f\"{outdir}/tls_206_scope.csv\")\n", "\n", "appln_pers = pd.read_csv(f\"{outdir}/tls_207_scope.csv\")\n", "\n", "appln_cpc = pd.read_csv(f\"{outdir}/tls_224_scope.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "f878b151", "metadata": {}, "outputs": [], "source": [ "# workdir_path=r\"CPCTitleList202302\"\n", "# # outfile='wos_extract_complete.csv'\n", "# # with_header=True\n", "# cpc_ids = pd.DataFrame()\n", "# for root, dirs, files in os.walk(workdir_path):\n", "# for filename in files:\n", "# path=os.path.join(root, filename)\n", "# section = pd.read_csv(path, sep='\\t', header=None)\n", "# cpc_ids=pd.concat([cpc_ids,section], ignore_index=True)\n", "# cpc_ids.columns =[\"cpc_id\",\"idk\",\"cpc_name\"]\n", "# cpc_ids = cpc_ids.drop(columns=\"idk\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "95ea20da", "metadata": {}, "outputs": [], "source": [ "parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}\n", "for letter in 'ABCDEFGHY':\n", " file = f'CPC_data/CPCTitleList202302/cpc-section-{letter}_20230201.txt'\n", " with open(file) as f:\n", " for line in f:\n", " vals = line.strip().split('\\t')\n", " if len(vals) == 2:\n", " parsed['code'].append(vals[0])\n", " parsed['title'].append(vals[1])\n", " elif len(vals) == 3:\n", " parsed['code'].append(vals[0])\n", " parsed['title'].append(vals[2])\n", "\n", "\n", "\n", "for i in range(len(parsed['code'])):\n", " code = parsed['code'][i]\n", " main_group = code.split('/')[-1] if \"/\" in code else None\n", " group = code.split('/')[0][4:] if len(code) >= 5 else None\n", " subclass = code[3] if len(code) >= 4 else None\n", " class_ = code[1:3] if len(code) >= 3 else None\n", " section = code[0] if len(code) >= 1 else None\n", " \n", " parsed['main_group'].append(main_group)\n", " parsed['group'].append(group)\n", " parsed['subclass'].append(subclass)\n", " parsed['class'].append(class_)\n", " parsed['section'].append(section)\n", "\n", "cpc_ids2023 = pd.DataFrame.from_dict(parsed)\n", "cpc_ids2023['cpc_version']=2023\n", "cpc_ids2022 = pd.read_csv(\"CPC_data/cpc_titles_2022.csv\")\n", "cpc_ids2022['cpc_version']=2022\n", "cpc_ids = pd.concat([cpc_ids2023,cpc_ids2022], ignore_index=True)\n", "cpc_ids = cpc_ids.rename(columns={\"code\":\"cpc_id\",\"title\":\"cpc_name\"}).drop_duplicates(subset=\"cpc_id\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "907d9c3e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "id": "1be8971a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "70 cpc_ids not found\n", "0.07344840249724569 % lost\n" ] } ], "source": [ "appln_cpc[\"cpc_id\"] = appln_cpc[\"cpc_class_symbol\"].str.replace(\" \",\"\")\n", "appln_cpc_tax = appln_cpc.merge(cpc_ids, on=\"cpc_id\", how=\"left\")\n", "\n", "print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique()), \"cpc_ids not found\")\n", "print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique())/len(appln_cpc_tax[\"cpc_id\"].unique())*100, \"% lost\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "b1274c34", "metadata": {}, "outputs": [], "source": [ "cpc_dict = dict(zip(cpc_ids.cpc_id.str.replace(\" \",\"\"), cpc_ids.cpc_name))\n", "# cpc_dict" ] }, { "cell_type": "code", "execution_count": 9, "id": "2a7e39ee", "metadata": {}, "outputs": [], "source": [ "def cpc_classifier(id_text):\n", " taxonomy = []\n", " iter_text = id_text.replace(\" \",\"\")\n", " for i in range(len(iter_text)+1):\n", " tax_id = iter_text[:i]\n", " tax_name = cpc_dict.get(iter_text[:i])\n", " if tax_name:\n", " taxonomy.append((tax_id,tax_name))\n", " return taxonomy\n", " " ] }, { "cell_type": "code", "execution_count": 10, "id": "e31a013f", "metadata": {}, "outputs": [ { "data": { "text/plain": "[('A', 'HUMAN NECESSITIES'),\n ('A01',\n 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING'),\n ('A01B',\n 'SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS, DETAILS, OR ACCESSORIES OF AGRICULTURAL MACHINES OR IMPLEMENTS, IN GENERAL (making or covering furrows or holes for sowing, planting, or manuring A01C5/00; soil working for engineering purposes E01, E02, E21; {measuring areas for agricultural purposes G01B})'),\n ('A01B1/06',\n 'Hoes; Hand cultivators {(rakes A01D7/00; forks A01D9/00; picks B25D)}'),\n ('A01B1/065', '{powered}')]" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cpc_classifier(\"A01B1/065\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "f09a616c", "metadata": {}, "outputs": [ { "data": { "text/plain": " cpc_id cpc_name section class \n0 A HUMAN NECESSITIES A None \\\n1 A01 AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... A 01 \n2 A01B SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... A 01 \n3 A01B1/00 Hand tools (edge trimmers for lawns A01G3/06 ... A 01 \n4 A01B1/02 Spades; Shovels {(hand-operated dredgers E02F3... A 01 \n\n subclass group main_group cpc_version \n0 None None None 2023 \\\n1 None None None 2023 \n2 B None None 2023 \n3 B 1 00 2023 \n4 B 1 02 2023 \n\n version https://git-lfs.github.com/spec/v1 \n0 NaN \\\n1 NaN \n2 NaN \n3 NaN \n4 NaN \n\n cpc_taxonomy \n0 [(A, HUMAN NECESSITIES)] \n1 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n2 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n3 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n4 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... ", "text/html": "
\n | cpc_id | \ncpc_name | \nsection | \nclass | \nsubclass | \ngroup | \nmain_group | \ncpc_version | \nversion https://git-lfs.github.com/spec/v1 | \ncpc_taxonomy | \n
---|---|---|---|---|---|---|---|---|---|---|
0 | \nA | \nHUMAN NECESSITIES | \nA | \nNone | \nNone | \nNone | \nNone | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES)] | \n
1 | \nA01 | \nAGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... | \nA | \n01 | \nNone | \nNone | \nNone | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... | \n
2 | \nA01B | \nSOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... | \nA | \n01 | \nB | \nNone | \nNone | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... | \n
3 | \nA01B1/00 | \nHand tools (edge trimmers for lawns A01G3/06 ... | \nA | \n01 | \nB | \n1 | \n00 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... | \n
4 | \nA01B1/02 | \nSpades; Shovels {(hand-operated dredgers E02F3... | \nA | \n01 | \nB | \n1 | \n02 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... | \n
\n | cpc_id | \ncpc_name | \nsection | \nclass | \nsubclass | \ngroup | \nmain_group | \ncpc_version | \nversion https://git-lfs.github.com/spec/v1 | \ncpc_taxonomy | \n
---|---|---|---|---|---|---|---|---|---|---|
12725 | \nA61B1/000096 | \n{using artificial intelligence} | \nA | \n61 | \nB | \n1 | \n000096 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \n
13764 | \nA61B5/7264 | \n{Classification of physiological signals or da... | \nA | \n61 | \nB | \n5 | \n7264 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \n
13897 | \nA61B6/52 | \n{Devices using data or image processing specia... | \nA | \n61 | \nB | \n6 | \n52 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \n
14016 | \nA61B8/52 | \n{Devices using data or image processing specia... | \nA | \n61 | \nB | \n8 | \n52 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \n
15252 | \nA61B2018/0069 | \n{using fuzzy logic} | \nA | \n61 | \nB | \n2018 | \n0069 | \n2023 | \nNaN | \n[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
250685 | \nY10S707/99946 | \nObject-oriented database structure network | \nY | \n10 | \nS | \n707 | \n99946 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \n
250686 | \nY10S707/99947 | \nObject-oriented database structure reference | \nY | \n10 | \nS | \n707 | \n99947 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \n
250687 | \nY10S707/99948 | \nApplication of database or data structure, e.g... | \nY | \n10 | \nS | \n707 | \n99948 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \n
250688 | \nY10S707/99951 | \nFile or database maintenance | \nY | \n10 | \nS | \n707 | \n99951 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \n
250703 | \nY10S715/968 | \ninterface for database querying and retrieval | \nY | \n10 | \nS | \n715 | \n968 | \n2023 | \nNaN | \n[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... | \n
317 rows × 10 columns
\n\n | appln_id | \nappln_auth | \nappln_nr | \nappln_kind | \nappln_filing_date | \nappln_filing_year | \nappln_nr_original | \nipr_type | \nreceiving_office | \ninternat_appln_id | \n... | \nearliest_pat_publn_id | \ngranted | \ndocdb_family_id | \ninpadoc_family_id | \ndocdb_family_size | \nnb_citing_docdb_fam | \nnb_applicants | \nnb_inventors | \nappln_title_lg | \nappln_title | \n
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n340657036 | \nEP | \n12000117 | \nA | \n2012-01-09 | \n2012 | \n12000117 | \nPI | \n\n | 0 | \n... | \n407623142 | \nY | \n45507394 | \n340657036 | \n3 | \n6 | \n1 | \n2 | \nen | \nRotating membrane filter disc apparatus | \n
1 | \n340982410 | \nEP | \n12151915 | \nA | \n2012-01-20 | \n2012 | \n12151915 | \nPI | \n\n | 0 | \n... | \n365158710 | \nY | \n45531220 | \n340982410 | \n2 | \n16 | \n2 | \n6 | \nen | \nHeating-Cooling-Capacity measurement controlli... | \n
2 | \n341078960 | \nEP | \n12700310 | \nA | \n2012-01-11 | \n2012 | \n12700310 | \nPI | \n\n | 340778427 | \n... | \n413564969 | \nY | \n45491582 | \n340778427 | \n3 | \n2 | \n1 | \n1 | \nen | \nTRANSMISSION DEVICE | \n
3 | \n341078962 | \nEP | \n12700311 | \nA | \n2012-01-11 | \n2012 | \n12700311 | \nPI | \n\n | 340778431 | \n... | \n413564970 | \nY | \n45491583 | \n340778431 | \n3 | \n3 | \n1 | \n1 | \nen | \nTRANSMISSION DEVICE | \n
4 | \n341127772 | \nEP | \n12700372 | \nA | \n2012-01-02 | \n2012 | \n12700372 | \nPI | \n\n | 340460188 | \n... | \n421840120 | \nY | \n45495923 | \n340460188 | \n4 | \n8 | \n1 | \n2 | \nen | \nPOWER CONTROL IN A WIRELESS COMMUNICATION SYST... | \n
5 rows × 28 columns
\n