You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/patstat_cpc_parse.ipynb

388 lines
52 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 44,
"id": "a8be6839",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import janitor\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "211ba466",
"metadata": {},
"outputs": [],
"source": [
"outdir=\"WESTERN_CH_scope\"\n",
"\n",
"appln = pd.read_csv(f\"{outdir}/tls_201_scope.csv\")\n",
"\n",
"appln_title = pd.read_csv(f\"{outdir}/tls_202_scope.csv\")\n",
"\n",
"pers = pd.read_csv(f\"{outdir}/tls_206_scope.csv\")\n",
"\n",
"appln_pers = pd.read_csv(f\"{outdir}/tls_207_scope.csv\")\n",
"\n",
"appln_cpc = pd.read_csv(f\"{outdir}/tls_224_scope.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "f878b151",
"metadata": {},
"outputs": [],
"source": [
"# workdir_path=r\"CPCTitleList202302\"\n",
"# # outfile='wos_extract_complete.csv'\n",
"# # with_header=True\n",
"# cpc_ids = pd.DataFrame()\n",
"# for root, dirs, files in os.walk(workdir_path):\n",
"# for filename in files:\n",
"# path=os.path.join(root, filename)\n",
"# section = pd.read_csv(path, sep='\\t', header=None)\n",
"# cpc_ids=pd.concat([cpc_ids,section], ignore_index=True)\n",
"# cpc_ids.columns =[\"cpc_id\",\"idk\",\"cpc_name\"]\n",
"# cpc_ids = cpc_ids.drop(columns=\"idk\")"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "95ea20da",
"metadata": {},
"outputs": [],
"source": [
"parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}\n",
"for letter in 'ABCDEFGHY':\n",
" file = f'CPC_data/CPCTitleList202302/cpc-section-{letter}_20230201.txt'\n",
" with open(file) as f:\n",
" for line in f:\n",
" vals = line.strip().split('\\t')\n",
" if len(vals) == 2:\n",
" parsed['code'].append(vals[0])\n",
" parsed['title'].append(vals[1])\n",
" elif len(vals) == 3:\n",
" parsed['code'].append(vals[0])\n",
" parsed['title'].append(vals[2])\n",
"\n",
"\n",
"\n",
"for i in range(len(parsed['code'])):\n",
" code = parsed['code'][i]\n",
" main_group = code.split('/')[-1] if \"/\" in code else None\n",
" group = code.split('/')[0][4:] if len(code) >= 5 else None\n",
" subclass = code[3] if len(code) >= 4 else None\n",
" class_ = code[1:3] if len(code) >= 3 else None\n",
" section = code[0] if len(code) >= 1 else None\n",
" \n",
" parsed['main_group'].append(main_group)\n",
" parsed['group'].append(group)\n",
" parsed['subclass'].append(subclass)\n",
" parsed['class'].append(class_)\n",
" parsed['section'].append(section)\n",
"\n",
"cpc_ids2023 = pd.DataFrame.from_dict(parsed)\n",
"cpc_ids2023['cpc_version']=2023\n",
"cpc_ids2022 = pd.read_csv(\"CPC_data/cpc_titles_2022.csv\")\n",
"cpc_ids2022['cpc_version']=2022\n",
"cpc_ids = pd.concat([cpc_ids2023,cpc_ids2022], ignore_index=True)\n",
"cpc_ids = cpc_ids.rename(columns={\"code\":\"cpc_id\",\"title\":\"cpc_name\"}).drop_duplicates(subset=\"cpc_id\")"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "907d9c3e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 48,
"id": "1be8971a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70 cpc_ids not found\n",
"0.07344840249724569 % lost\n"
]
}
],
"source": [
"appln_cpc[\"cpc_id\"] = appln_cpc[\"cpc_class_symbol\"].str.replace(\" \",\"\")\n",
"appln_cpc_tax = appln_cpc.merge(cpc_ids, on=\"cpc_id\", how=\"left\")\n",
"\n",
"print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique()), \"cpc_ids not found\")\n",
"print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique())/len(appln_cpc_tax[\"cpc_id\"].unique())*100, \"% lost\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "b1274c34",
"metadata": {},
"outputs": [],
"source": [
"cpc_dict = dict(zip(cpc_ids.cpc_id.str.replace(\" \",\"\"), cpc_ids.cpc_name))\n",
"# cpc_dict"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "2a7e39ee",
"metadata": {},
"outputs": [],
"source": [
"def cpc_classifier(id_text):\n",
" taxonomy = []\n",
" iter_text = id_text.replace(\" \",\"\")\n",
" for i in range(len(iter_text)+1):\n",
" tax_id = iter_text[:i]\n",
" tax_name = cpc_dict.get(iter_text[:i])\n",
" if tax_name:\n",
" taxonomy.append((tax_id,tax_name))\n",
" return taxonomy\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "e31a013f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "[('A', 'HUMAN NECESSITIES'),\n ('A01',\n 'AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTING; TRAPPING; FISHING'),\n ('A01B',\n 'SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS, DETAILS, OR ACCESSORIES OF AGRICULTURAL MACHINES OR IMPLEMENTS, IN GENERAL (making or covering furrows or holes for sowing, planting, or manuring A01C5/00; soil working for engineering purposes E01, E02, E21; {measuring areas for agricultural purposes G01B})'),\n ('A01B1/06',\n 'Hoes; Hand cultivators {(rakes A01D7/00; forks A01D9/00; picks B25D)}'),\n ('A01B1/065', '{powered}')]"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cpc_classifier(\"A01B1/065\")"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "f09a616c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name section class \n0 A HUMAN NECESSITIES A None \\\n1 A01 AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI... A 01 \n2 A01B SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS... A 01 \n3 A01B1/00 Hand tools (edge trimmers for lawns A01G3/06 ... A 01 \n4 A01B1/02 Spades; Shovels {(hand-operated dredgers E02F3... A 01 \n\n subclass group main_group cpc_version \n0 None None None 2023 \\\n1 None None None 2023 \n2 B None None 2023 \n3 B 1 00 2023 \n4 B 1 02 2023 \n\n version https://git-lfs.github.com/spec/v1 \n0 NaN \\\n1 NaN \n2 NaN \n3 NaN \n4 NaN \n\n cpc_taxonomy \n0 [(A, HUMAN NECESSITIES)] \n1 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n2 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n3 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... \n4 [(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO... ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>A</td>\n <td>HUMAN NECESSITIES</td>\n <td>A</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES)]</td>\n </tr>\n <tr>\n <th>1</th>\n <td>A01</td>\n <td>AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...</td>\n <td>A</td>\n <td>01</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A01B</td>\n <td>SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>None</td>\n <td>None</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A01B1/00</td>\n <td>Hand tools (edge trimmers for lawns A01G3/06 ...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>1</td>\n <td>00</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A01B1/02</td>\n <td>Spades; Shovels {(hand-operated dredgers E02F3...</td>\n <td>A</td>\n <td>01</td>\n <td>B</td>\n <td>1</td>\n <td>02</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A01, AGRICULTURE; FO...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cpc_ids[\"cpc_taxonomy\"] = cpc_ids[\"cpc_id\"].fillna(\"\").map(cpc_classifier)\n",
"cpc_ids.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "f3fa8bf3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70 cpc_ids not found\n",
"0.07344840249724569 % lost\n"
]
}
],
"source": [
"appln_cpc[\"cpc_id\"] = appln_cpc[\"cpc_class_symbol\"].str.replace(\" \",\"\")\n",
"appln_cpc_tax = appln_cpc.merge(cpc_ids, on=\"cpc_id\", how=\"left\")\n",
"print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique()), \"cpc_ids not found\")\n",
"print(len(appln_cpc_tax[appln_cpc_tax[\"cpc_name\"].isna()][\"cpc_id\"].unique())/len(appln_cpc_tax[\"cpc_id\"].unique())*100, \"% lost\")"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "58701721",
"metadata": {},
"outputs": [],
"source": [
"# cpc_ids\n",
"cpc_ids[\"cpc_fullname\"] = cpc_ids[\"cpc_taxonomy\"].apply(lambda x: \"<>\".join([y[1] for y in x]))\n",
"cpc_ids.sample(100)\n",
"colnames = [\"tax_level_\"+ str(i) for i in cpc_ids[\"cpc_fullname\"].str.split('<>', expand=True).columns]\n",
"cpc_ids[colnames] = cpc_ids[\"cpc_fullname\"].str.split('<>', expand=True)\n",
"# cpc_ids"
]
},
{
"cell_type": "markdown",
"id": "ca631acf",
"metadata": {},
"source": [
"## 'AI/Big Data' keywords"
]
},
{
"cell_type": "code",
"execution_count": 55,
"outputs": [
{
"data": {
"text/plain": "'neural network|machine learn|deep learn|remote sensing|convolutional neural|internet of things|feature extraction|genetic algorithm|big data|artificial intelligence|data driven|support vector machine|logistic regression not p=|optimization algorithm|principal component analysis|artificial neural network|swarm optimization|regularization|linear regression not p=|optimization algorithm|random forest|cloud computing|reinforcement learning|computer vision|kalman filter|image processing|data mining|evolutionary algorithm|edge computing|supervised learning|computational modeling|pattern recognition|image classification|long short-term memor|robotics|image segmentation|convex optimization|covariance matri|attention mechanism|markov chain|object detection not brain|clustering algorithm|recurrent neural network|data augmentation|transfer learning|adversarial network|decision tree|multi agent system|fuzzy set|convolutional network|image reconstruction|data analytic|smart grid|autoencoder|fuzzy logic|radial basis function|bayesian network|dimensionality reduction|face recognition not brain|gaussian process|anomaly detection|k-nearest neighbor|natural language processing|monte carlo method|large dataset|gradient descent|support vector regression|extreme learning machine|perceptron|model selection|ensemble learning|representation learning|recommender system|target tracking|singular value decomposition|feature learning|smart city|sentiment analy|markov decision process|k-means clustering|independent component analysis|brain computer interface|human-computer interaction|markov chain monte carlo|hierarchical clustering|semantic web|semi-supervised learning|human-robot interact|knowledge graph|speech recognition not brain|ensemble model|fog computing|mapreduce|evolutionary computation|data science|text mining|generative model|active learning|swarm intelligence|multi-task learning|language model|collaborative filtering|backpropagation|machine vision|computer-aided 
diagnosis|gated recurrent unit|lagrange multiplier|expert system|learning rate|hadoop|markov process|nonlinear optimization|learning system|self-organizing map|smart manufacturing|smart home|few shot learning|few-shot learning|meta-learning|meta learning|adversarial training|zero-shot learning|word embedding|expectation maximization algorithm|stochastic gradient descent|ridge regression|deep belief network|non-negative matrix factorization|affective computing|latent dirichlet allocation|kernel method|kernel learning|feature engineering|variational inference|image representation|manifold learning|adversarial example|knowledge distillation|time series forecast|variational autoencoder|lasso regression|smart energy|dbscan|multi-label classification|intelligent robot|ubiquitous computing|gaussian mixture models|smart technolog|boltzmann machine|smart buildings|predictive analytic|pervasive computing|smart agriculture|capsule network|human-in-the-loop|intelligent agent|ai applications|word vector|transformer model|facial recognition|unstructured data|restricted boltzmann machine|albert|lifelong learning|autonomous agents|chatbot|cholesky decomposition|nosql|nosql|explainable ai|seq2seq|probabilistic graphical model|qr decomposition|unsupervised deep learning|data warehouse|quantum machine learning|continual learning|smart environment|multimodal learning|smart health|artificial immune system|swarm robotics|kernel machine|latent factor model|eigendecomposition|adversarial machine|adversarial machine learning|smart mobility|sequence-to-sequence model|eigen decomposition|adversarial robustness|smart parking|adversarial neural|roberta|bidirectional encoder representations from transformer|locally linear embedding|hebbian learning|one-shot learning|multimodal representation|smart tourism|entity extraction|adaptive moment estimation|ontology learning|topic modeling|relational database'"
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keywords_oklist_source= r'..\\WOS\\kw_token_ranked_bibliometrics_okset.xlsx'\n",
"keyword_df = pd.read_excel(keywords_oklist_source)\n",
"keywords = keyword_df[keyword_df[\"u_Priority (done)\"].isin([\"High\",\"Medium\"])][\"kw_token\"].str.replace('\"','').tolist()\n",
"keywords = [kw.replace(\"*\",\"\").replace(\"$\",\"\").lower() for kw in keywords if (\"?\" not in kw and len(kw)>3)]\n",
"keywords = \"|\".join([kw for kw in keywords if kw not in [\"classifier\",\"clustering\",\"loss function\",'classification']]+[\"relational database\"])\n",
"keywords"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 55,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 56,
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n12725 A61B1/000096 {using artificial intelligence} \\\n13764 A61B5/7264 {Classification of physiological signals or da... \n13897 A61B6/52 {Devices using data or image processing specia... \n14016 A61B8/52 {Devices using data or image processing specia... \n15252 A61B2018/0069 {using fuzzy logic} \n... ... ... \n250685 Y10S707/99946 Object-oriented database structure network \n250686 Y10S707/99947 Object-oriented database structure reference \n250687 Y10S707/99948 Application of database or data structure, e.g... \n250688 Y10S707/99951 File or database maintenance \n250703 Y10S715/968 interface for database querying and retrieval \n\n section class subclass group main_group cpc_version \n12725 A 61 B 1 000096 2023 \\\n13764 A 61 B 5 7264 2023 \n13897 A 61 B 6 52 2023 \n14016 A 61 B 8 52 2023 \n15252 A 61 B 2018 0069 2023 \n... ... ... ... ... ... ... \n250685 Y 10 S 707 99946 2023 \n250686 Y 10 S 707 99947 2023 \n250687 Y 10 S 707 99948 2023 \n250688 Y 10 S 707 99951 2023 \n250703 Y 10 S 715 968 2023 \n\n version https://git-lfs.github.com/spec/v1 \n12725 NaN \\\n13764 NaN \n13897 NaN \n14016 NaN \n15252 NaN \n... ... \n250685 NaN \n250686 NaN \n250687 NaN \n250688 NaN \n250703 NaN \n\n cpc_taxonomy \n12725 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \\\n13764 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13897 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n14016 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n15252 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n... ... \n250685 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250686 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250687 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250688 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250703 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n\n cpc_fullname \n12725 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \\\n13764 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... 
\n13897 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n14016 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n15252 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n... ... \n250685 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250686 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250687 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250688 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250703 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n\n tax_level_0 \n12725 HUMAN NECESSITIES \\\n13764 HUMAN NECESSITIES \n13897 HUMAN NECESSITIES \n14016 HUMAN NECESSITIES \n15252 HUMAN NECESSITIES \n... ... \n250685 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250686 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250687 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250688 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250703 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n\n tax_level_1 \n12725 MEDICAL OR VETERINARY SCIENCE; HYGIENE \\\n13764 MEDICAL OR VETERINARY SCIENCE; HYGIENE \n13897 MEDICAL OR VETERINARY SCIENCE; HYGIENE \n14016 MEDICAL OR VETERINARY SCIENCE; HYGIENE \n15252 MEDICAL OR VETERINARY SCIENCE; HYGIENE \n... ... \n250685 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n250686 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n250687 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n250688 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n250703 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n\n tax_level_2 \n12725 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \\\n13764 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \n13897 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \n14016 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \n15252 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \n... ... \n250685 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... \n250686 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... \n250687 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... 
\n250688 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... \n250703 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... \n\n tax_level_3 \n12725 Instruments for performing medical examination... \\\n13764 {Signal processing specially adapted for physi... \n13897 {Devices using data or image processing specia... \n14016 {Devices using data or image processing specia... \n15252 {using fuzzy logic} \n... ... \n250685 Distributed or remote access \n250686 Distributed or remote access \n250687 Distributed or remote access \n250688 File or database maintenance \n250703 interface for database querying and retrieval \n\n tax_level_4 \n12725 {of image signals during a use of endoscope} \\\n13764 {using Wavelet transforms} \n13897 None \n14016 None \n15252 None \n... ... \n250685 Object-oriented database structure network \n250686 Object-oriented database structure reference \n250687 Application of database or data structure, e.g... \n250688 None \n250703 None \n\n tax_level_5 tax_level_6 \n12725 {using artificial intelligence} None \\\n13764 {Classification of physiological signals or da... None \n13897 None None \n14016 None None \n15252 None None \n... ... ... \n250685 None None \n250686 None None \n250687 None None \n250688 None None \n250703 None None \n\n tax_level_7 \n12725 None \n13764 None \n13897 None \n14016 None \n15252 None \n... ... \n250685 None \n250686 None \n250687 None \n250688 None \n250703 None \n\n[317 rows x 19 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n <th>cpc_fullname</th>\n <th>tax_level_0</th>\n <th>tax_level_1</th>\n <th>tax_level_2</th>\n <th>tax_level_3</th>\n <th>tax_level_4</th>\n <th>tax_level_5</th>\n <th>tax_level_6</th>\n <th>tax_level_7</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12725</th>\n <td>A61B1/000096</td>\n <td>{using artificial intelligence}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>1</td>\n <td>000096</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>Instruments for performing medical examination...</td>\n <td>{of image signals during a use of endoscope}</td>\n <td>{using artificial intelligence}</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13764</th>\n <td>A61B5/7264</td>\n <td>{Classification of physiological signals or da...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>7264</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Signal processing specially adapted for physi...</td>\n 
<td>{using Wavelet transforms}</td>\n <td>{Classification of physiological signals or da...</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13897</th>\n <td>A61B6/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>6</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Devices using data or image processing specia...</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>14016</th>\n <td>A61B8/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>8</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Devices using data or image processing specia...</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>15252</th>\n <td>A61B2018/0069</td>\n <td>{using fuzzy logic}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>2018</td>\n <td>0069</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{using fuzzy logic}</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n 
<td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>250685</th>\n <td>Y10S707/99946</td>\n <td>Object-oriented database structure network</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99946</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>Distributed or remote access</td>\n <td>Object-oriented database structure network</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>250686</th>\n <td>Y10S707/99947</td>\n <td>Object-oriented database structure reference</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99947</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>Distributed or remote access</td>\n <td>Object-oriented database structure reference</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>250687</th>\n <td>Y10S707/99948</td>\n <td>Application of database or data structure, e.g...</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99948</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL 
SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>Distributed or remote access</td>\n <td>Application of database or data structure, e.g...</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>250688</th>\n <td>Y10S707/99951</td>\n <td>File or database maintenance</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>707</td>\n <td>99951</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>File or database maintenance</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>250703</th>\n <td>Y10S715/968</td>\n <td>interface for database querying and retrieval</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>715</td>\n <td>968</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>interface for database querying and retrieval</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n </tbody>\n</table>\n<p>317 rows × 19 columns</p>\n</div>"
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#dummy search\n",
"scope_df = cpc_ids[cpc_ids[\"cpc_name\"].str.lower().str.contains(\"machine learn|neural network|deep learn|deep network|artificial intel*| big data|database|recommender system|computer vision|image processing|language model|language processing|fuzzy logic|principal component|image classification|video classification\", regex=True, na=False)]\n",
"scope_df"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 57,
"outputs": [
{
"data": {
"text/plain": " cpc_id cpc_name \n12725 A61B1/000096 {using artificial intelligence} \\\n13746 A61B5/72 {Signal processing specially adapted for physi... \n13764 A61B5/7264 {Classification of physiological signals or da... \n13897 A61B6/52 {Devices using data or image processing specia... \n14016 A61B8/52 {Devices using data or image processing specia... \n... ... ... \n246159 Y10S128/924 using artificial intelligence \n246160 Y10S128/925 Neural network \n248454 Y10S323/909 Remote sensing \n250570 Y10S706/00 Data processing: artificial intelligence \n250571 Y10S706/90 Fuzzy logic \n\n section class subclass group main_group cpc_version \n12725 A 61 B 1 000096 2023 \\\n13746 A 61 B 5 72 2023 \n13764 A 61 B 5 7264 2023 \n13897 A 61 B 6 52 2023 \n14016 A 61 B 8 52 2023 \n... ... ... ... ... ... ... \n246159 Y 10 S 128 924 2023 \n246160 Y 10 S 128 925 2023 \n248454 Y 10 S 323 909 2023 \n250570 Y 10 S 706 00 2023 \n250571 Y 10 S 706 90 2023 \n\n version https://git-lfs.github.com/spec/v1 \n12725 NaN \\\n13746 NaN \n13764 NaN \n13897 NaN \n14016 NaN \n... ... \n246159 NaN \n246160 NaN \n248454 NaN \n250570 NaN \n250571 NaN \n\n cpc_taxonomy \n12725 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \\\n13746 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13764 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n13897 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n14016 [(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE... \n... ... \n246159 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n246160 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n248454 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250570 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n250571 [(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE... \n\n cpc_fullname \n12725 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \\\n13746 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n13764 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... 
\n13897 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n14016 HUMAN NECESSITIES<>MEDICAL OR VETERINARY SCIEN... \n... ... \n246159 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n246160 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n248454 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250570 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250571 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n\n tax_level_0 \n12725 HUMAN NECESSITIES \\\n13746 HUMAN NECESSITIES \n13764 HUMAN NECESSITIES \n13897 HUMAN NECESSITIES \n14016 HUMAN NECESSITIES \n... ... \n246159 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n246160 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n248454 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250570 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n250571 GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME... \n\n tax_level_1 \n12725 MEDICAL OR VETERINARY SCIENCE; HYGIENE \\\n13746 MEDICAL OR VETERINARY SCIENCE; HYGIENE \n13764 MEDICAL OR VETERINARY SCIENCE; HYGIENE \n13897 MEDICAL OR VETERINARY SCIENCE; HYGIENE \n14016 MEDICAL OR VETERINARY SCIENCE; HYGIENE \n... ... \n246159 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n246160 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n248454 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n250570 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n250571 TECHNICAL SUBJECTS COVERED BY FORMER USPC \n\n tax_level_2 \n12725 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \\\n13746 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \n13764 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \n13897 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \n14016 DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ... \n... ... \n246159 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... \n246160 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... \n248454 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... \n250570 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... 
\n250571 TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS... \n\n tax_level_3 \n12725 Instruments for performing medical examination... \\\n13746 {Signal processing specially adapted for physi... \n13764 {Signal processing specially adapted for physi... \n13897 {Devices using data or image processing specia... \n14016 {Devices using data or image processing specia... \n... ... \n246159 Computer assisted medical diagnostics \n246160 Computer assisted medical diagnostics \n248454 Remote sensing \n250570 Data processing: artificial intelligence \n250571 Fuzzy logic \n\n tax_level_4 \n12725 {of image signals during a use of endoscope} \\\n13746 None \n13764 {using Wavelet transforms} \n13897 None \n14016 None \n... ... \n246159 using artificial intelligence \n246160 Neural network \n248454 None \n250570 None \n250571 None \n\n tax_level_5 tax_level_6 \n12725 {using artificial intelligence} None \\\n13746 None None \n13764 {Classification of physiological signals or da... None \n13897 None None \n14016 None None \n... ... ... \n246159 None None \n246160 None None \n248454 None None \n250570 None None \n250571 None None \n\n tax_level_7 \n12725 None \n13746 None \n13764 None \n13897 None \n14016 None \n... ... \n246159 None \n246160 None \n248454 None \n250570 None \n250571 None \n\n[358 rows x 19 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>cpc_id</th>\n <th>cpc_name</th>\n <th>section</th>\n <th>class</th>\n <th>subclass</th>\n <th>group</th>\n <th>main_group</th>\n <th>cpc_version</th>\n <th>version https://git-lfs.github.com/spec/v1</th>\n <th>cpc_taxonomy</th>\n <th>cpc_fullname</th>\n <th>tax_level_0</th>\n <th>tax_level_1</th>\n <th>tax_level_2</th>\n <th>tax_level_3</th>\n <th>tax_level_4</th>\n <th>tax_level_5</th>\n <th>tax_level_6</th>\n <th>tax_level_7</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>12725</th>\n <td>A61B1/000096</td>\n <td>{using artificial intelligence}</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>1</td>\n <td>000096</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>Instruments for performing medical examination...</td>\n <td>{of image signals during a use of endoscope}</td>\n <td>{using artificial intelligence}</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13746</th>\n <td>A61B5/72</td>\n <td>{Signal processing specially adapted for physi...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>72</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Signal processing specially adapted for physi...</td>\n 
<td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13764</th>\n <td>A61B5/7264</td>\n <td>{Classification of physiological signals or da...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>5</td>\n <td>7264</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Signal processing specially adapted for physi...</td>\n <td>{using Wavelet transforms}</td>\n <td>{Classification of physiological signals or da...</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>13897</th>\n <td>A61B6/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>6</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Devices using data or image processing specia...</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>14016</th>\n <td>A61B8/52</td>\n <td>{Devices using data or image processing specia...</td>\n <td>A</td>\n <td>61</td>\n <td>B</td>\n <td>8</td>\n <td>52</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(A, HUMAN NECESSITIES), (A61, MEDICAL OR VETE...</td>\n <td>HUMAN NECESSITIES&lt;&gt;MEDICAL OR VETERINARY SCIEN...</td>\n <td>HUMAN NECESSITIES</td>\n <td>MEDICAL OR VETERINARY SCIENCE; HYGIENE</td>\n <td>DIAGNOSIS; SURGERY; IDENTIFICATION (analysing ...</td>\n <td>{Devices using data or image processing specia...</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n 
<td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>246159</th>\n <td>Y10S128/924</td>\n <td>using artificial intelligence</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>128</td>\n <td>924</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>Computer assisted medical diagnostics</td>\n <td>using artificial intelligence</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>246160</th>\n <td>Y10S128/925</td>\n <td>Neural network</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>128</td>\n <td>925</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>Computer assisted medical diagnostics</td>\n <td>Neural network</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>248454</th>\n <td>Y10S323/909</td>\n <td>Remote sensing</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>323</td>\n <td>909</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>Remote 
sensing</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>250570</th>\n <td>Y10S706/00</td>\n <td>Data processing: artificial intelligence</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>706</td>\n <td>00</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>Data processing: artificial intelligence</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n <tr>\n <th>250571</th>\n <td>Y10S706/90</td>\n <td>Fuzzy logic</td>\n <td>Y</td>\n <td>10</td>\n <td>S</td>\n <td>706</td>\n <td>90</td>\n <td>2023</td>\n <td>NaN</td>\n <td>[(Y, GENERAL TAGGING OF NEW TECHNOLOGICAL DEVE...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC</td>\n <td>TECHNICAL SUBJECTS COVERED BY FORMER USPC CROS...</td>\n <td>Fuzzy logic</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n <td>None</td>\n </tr>\n </tbody>\n</table>\n<p>358 rows × 19 columns</p>\n</div>"
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#wos keyword search\n",
"scope_df = cpc_ids[cpc_ids[\"cpc_name\"].str.lower().str.contains(keywords, regex=True, na=False)]\n",
"scope_df"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 58,
"id": "6c3baa5b",
"metadata": {},
"outputs": [],
"source": [
"# Flag every CPC definition whose id was selected by the keyword search.\n",
"scope_ids = scope_df[\"cpc_id\"].unique()\n",
"cpc_ids[\"data_scope\"] = cpc_ids[\"cpc_id\"].isin(scope_ids)\n",
"\n",
"# DataFrame.dropna returns a new frame instead of modifying the caller, so the\n",
"# result has to be kept; otherwise the all-NaN columns still end up in the CSV.\n",
"cpc_defs = cpc_ids.dropna(axis=1, how=\"all\")\n",
"\n",
"# Create the output folder if needed so the export also works on a fresh checkout.\n",
"os.makedirs(\"CPC_data\", exist_ok=True)\n",
"cpc_defs.to_csv(\"CPC_data/cpc_defs.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"outputs": [],
"source": [
"# cpc_ids"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 60,
"id": "2e8368b4",
"metadata": {},
"outputs": [],
"source": [
"# appln_data = appln.merge(appln_title, on=\"appln_id\")\n",
"# appln_data.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}