You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/PATSTAT/patstat_filter.ipynb

2771 lines
82 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 109 ms\n",
"Wall time: 114 ms\n"
]
}
],
"source": [
"%%time\n",
"import dask"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<dask.config.set at 0x184a38e5220>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.set(temporary_directory=r'D:\\PATSTAT\\dask_temp')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<dask.config.set at 0x184a3911190>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.set({'temporary_directory': r'D:\\PATSTAT\\dask_temp'})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'temporary-directory': 'D:\\\\PATSTAT\\\\dask_temp',\n",
" 'visualization': {'engine': None},\n",
" 'tokenize': {'ensure-deterministic': False},\n",
" 'dataframe': {'backend': 'pandas',\n",
" 'shuffle': {'method': None, 'compression': None},\n",
" 'parquet': {'metadata-task-size-local': 512, 'metadata-task-size-remote': 1},\n",
" 'dtype_backend': 'pandas',\n",
" 'convert_string': False},\n",
" 'array': {'backend': 'numpy',\n",
" 'rechunk': {'method': 'tasks'},\n",
" 'svg': {'size': 120},\n",
" 'slicing': {'split-large-chunks': None}},\n",
" 'optimization': {'annotations': {'fuse': True},\n",
" 'fuse': {'active': None,\n",
" 'ave-width': 1,\n",
" 'max-width': None,\n",
" 'max-height': inf,\n",
" 'max-depth-new-edges': None,\n",
" 'subgraphs': None,\n",
" 'rename-keys': True}}}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dask.config.config"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import dask.dataframe as dd\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\n",
"D:\\PATSTAT\n"
]
}
],
"source": [
"import os\n",
"print(os.getcwd()) # Prints the current working directory\n",
"\n",
"workdir_path=r\"D:\\PATSTAT\"\n",
"os.chdir(workdir_path)\n",
"print(os.getcwd())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# tls_201 = dd.read_csv(\"table_tls201.csv\", low_memory=False,dtype={'appln_nr': 'object',\n",
"# 'appln_nr_original': 'object'})\n",
"# tls_201.head()\n",
"# tls_206 = dd.read_csv(\"table_tls206.csv\", low_memory=False)\n",
"# tls_206.head()\n",
"# tls_207 = dd.read_csv(\"table_tls207.csv\", low_memory=False)\n",
"# tls_207.head()\n",
"# tls_207.to_parquet(\"tls_207.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>appln_auth</th>\n",
" <th>appln_nr</th>\n",
" <th>appln_kind</th>\n",
" <th>appln_filing_date</th>\n",
" <th>appln_filing_year</th>\n",
" <th>appln_nr_original</th>\n",
" <th>ipr_type</th>\n",
" <th>receiving_office</th>\n",
" <th>internat_appln_id</th>\n",
" <th>...</th>\n",
" <th>earliest_publn_date</th>\n",
" <th>earliest_publn_year</th>\n",
" <th>earliest_pat_publn_id</th>\n",
" <th>granted</th>\n",
" <th>docdb_family_id</th>\n",
" <th>inpadoc_family_id</th>\n",
" <th>docdb_family_size</th>\n",
" <th>nb_citing_docdb_fam</th>\n",
" <th>nb_applicants</th>\n",
" <th>nb_inventors</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>XX</td>\n",
" <td>None</td>\n",
" <td>D</td>\n",
" <td>9999-12-31</td>\n",
" <td>9999</td>\n",
" <td>None</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>9999-12-31</td>\n",
" <td>9999</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>EP</td>\n",
" <td>103094.0</td>\n",
" <td>A</td>\n",
" <td>2000-02-15</td>\n",
" <td>2000</td>\n",
" <td>00103094</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2000-09-20</td>\n",
" <td>2000</td>\n",
" <td>293253293</td>\n",
" <td>Y</td>\n",
" <td>8554171</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>EP</td>\n",
" <td>107845.0</td>\n",
" <td>A</td>\n",
" <td>1992-12-02</td>\n",
" <td>1992</td>\n",
" <td>00107845</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2000-08-02</td>\n",
" <td>2000</td>\n",
" <td>301548848</td>\n",
" <td>Y</td>\n",
" <td>27517085</td>\n",
" <td>2</td>\n",
" <td>8</td>\n",
" <td>56</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>EP</td>\n",
" <td>202556.0</td>\n",
" <td>A</td>\n",
" <td>2000-07-17</td>\n",
" <td>2000</td>\n",
" <td>00202556</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2001-01-24</td>\n",
" <td>2001</td>\n",
" <td>291964096</td>\n",
" <td>N</td>\n",
" <td>7915918</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>22</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>EP</td>\n",
" <td>300208.0</td>\n",
" <td>A</td>\n",
" <td>2000-01-13</td>\n",
" <td>2000</td>\n",
" <td>00300208</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2000-07-26</td>\n",
" <td>2000</td>\n",
" <td>292901055</td>\n",
" <td>Y</td>\n",
" <td>22889365</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>27</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" appln_id appln_auth appln_nr appln_kind appln_filing_date \n",
"0 0 XX None D 9999-12-31 \\\n",
"1 1 EP 103094.0 A 2000-02-15 \n",
"2 2 EP 107845.0 A 1992-12-02 \n",
"3 3 EP 202556.0 A 2000-07-17 \n",
"4 4 EP 300208.0 A 2000-01-13 \n",
"\n",
" appln_filing_year appln_nr_original ipr_type receiving_office \n",
"0 9999 None PI \\\n",
"1 2000 00103094 PI \n",
"2 1992 00107845 PI \n",
"3 2000 00202556 PI \n",
"4 2000 00300208 PI \n",
"\n",
" internat_appln_id ... earliest_publn_date earliest_publn_year \n",
"0 0 ... 9999-12-31 9999 \\\n",
"1 0 ... 2000-09-20 2000 \n",
"2 0 ... 2000-08-02 2000 \n",
"3 0 ... 2001-01-24 2001 \n",
"4 0 ... 2000-07-26 2000 \n",
"\n",
" earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n",
"0 0 N 0 0 \\\n",
"1 293253293 Y 8554171 1 \n",
"2 301548848 Y 27517085 2 \n",
"3 291964096 N 7915918 3 \n",
"4 292901055 Y 22889365 4 \n",
"\n",
" docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n",
"0 1 0 0 0 \n",
"1 6 79 1 4 \n",
"2 8 56 2 6 \n",
"3 4 22 2 3 \n",
"4 6 27 1 2 \n",
"\n",
"[5 rows x 26 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Application data\n",
"tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n",
"tls_201_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n",
"# (tls_201_p[\"appln_filing_year\"]<2024)&\n",
"# (tls_201_p[\"granted\"]==\"Y\"))][\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n",
"# (tls_201_p[\"appln_filing_year\"]<2024)&\n",
"# (tls_201_p[\"granted\"]==\"N\"))][\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>person_id</th>\n",
" <th>person_name</th>\n",
" <th>person_ctry_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Nokia Corporation</td>\n",
" <td>FI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Lipponen, Markku</td>\n",
" <td>FI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Laitinen, Timo</td>\n",
" <td>FI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Aho, Ari</td>\n",
" <td>FI</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Knuutila, Jarno</td>\n",
" <td>FI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" person_id person_name person_ctry_code\n",
"0 1 Nokia Corporation FI\n",
"1 2 Lipponen, Markku FI\n",
"2 3 Laitinen, Timo FI\n",
"3 4 Aho, Ari FI\n",
"4 5 Knuutila, Jarno FI"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_206_p = dd.read_parquet(\"tls_206.parquet\",columns=[\"person_id\",\"person_name\",\"person_ctry_code\"])\n",
"tls_206_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>person_id</th>\n",
" <th>appln_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>775</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1192</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" person_id appln_id\n",
"0 1 1\n",
"1 1 7\n",
"2 1 46\n",
"3 1 775\n",
"4 1 1192"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_207_p = dd.read_parquet(\"tls_207.parquet\",columns=[\"person_id\",\"appln_id\"])\n",
"tls_207_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# tls_207_p[tls_207_p[\"appln_id\"]==1].compute()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ctry_code</th>\n",
" <th>iso_alpha3</th>\n",
" <th>st3_name</th>\n",
" <th>organisation_flag</th>\n",
" <th>continent</th>\n",
" <th>eu_member</th>\n",
" <th>epo_member</th>\n",
" <th>oecd_member</th>\n",
" <th>discontinued</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>unknown</td>\n",
" <td>Y</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AD</td>\n",
" <td>AND</td>\n",
" <td>Andorra</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AE</td>\n",
" <td>ARE</td>\n",
" <td>United Arab Emirates</td>\n",
" <td></td>\n",
" <td>Asia</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AF</td>\n",
" <td>AFG</td>\n",
" <td>Afghanistan</td>\n",
" <td></td>\n",
" <td>Asia</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AG</td>\n",
" <td>ATG</td>\n",
" <td>Antigua and Barbuda</td>\n",
" <td></td>\n",
" <td>North America</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ctry_code iso_alpha3 st3_name organisation_flag continent \n",
"0 unknown Y NaN \\\n",
"1 AD AND Andorra Europe \n",
"2 AE ARE United Arab Emirates Asia \n",
"3 AF AFG Afghanistan Asia \n",
"4 AG ATG Antigua and Barbuda North America \n",
"\n",
" eu_member epo_member oecd_member discontinued \n",
"0 \n",
"1 \n",
"2 \n",
"3 \n",
"4 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_801 = dd.read_csv(\"table_tls801.csv\", low_memory=False)\n",
"tls_801.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ctry_code</th>\n",
" <th>iso_alpha3</th>\n",
" <th>st3_name</th>\n",
" <th>organisation_flag</th>\n",
" <th>continent</th>\n",
" <th>eu_member</th>\n",
" <th>epo_member</th>\n",
" <th>oecd_member</th>\n",
" <th>discontinued</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>CN</td>\n",
" <td>CHN</td>\n",
" <td>China</td>\n",
" <td></td>\n",
" <td>Asia</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>HK</td>\n",
" <td>HKG</td>\n",
" <td>Hong Kong, China</td>\n",
" <td></td>\n",
" <td>Asia</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>MO</td>\n",
" <td>MAC</td>\n",
" <td>Macao SAR (China)</td>\n",
" <td></td>\n",
" <td>Asia</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>217</th>\n",
" <td>TW</td>\n",
" <td>TWN</td>\n",
" <td>Taiwan Province Of China</td>\n",
" <td></td>\n",
" <td>Asia</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ctry_code iso_alpha3 st3_name organisation_flag \n",
"47 CN CHN China \\\n",
"97 HK HKG Hong Kong, China \n",
"147 MO MAC Macao SAR (China) \n",
"217 TW TWN Taiwan Province Of China \n",
"\n",
" continent eu_member epo_member oecd_member discontinued \n",
"47 Asia \n",
"97 Asia \n",
"147 Asia \n",
"217 Asia "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"china_df = tls_801[tls_801.st3_name.str.lower().str.contains(\"china\")].compute()\n",
"china_df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ctry_code</th>\n",
" <th>iso_alpha3</th>\n",
" <th>st3_name</th>\n",
" <th>organisation_flag</th>\n",
" <th>continent</th>\n",
" <th>eu_member</th>\n",
" <th>epo_member</th>\n",
" <th>oecd_member</th>\n",
" <th>discontinued</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>AT</td>\n",
" <td>AUT</td>\n",
" <td>Austria</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>BE</td>\n",
" <td>BEL</td>\n",
" <td>Belgium</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>BG</td>\n",
" <td>BGR</td>\n",
" <td>Bulgaria</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>CY</td>\n",
" <td>CYP</td>\n",
" <td>Cyprus</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>CZ</td>\n",
" <td>CZE</td>\n",
" <td>Czechia</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>DE</td>\n",
" <td>DEU</td>\n",
" <td>Germany</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>DK</td>\n",
" <td>DNK</td>\n",
" <td>Denmark</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>EE</td>\n",
" <td>EST</td>\n",
" <td>Estonia</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>ES</td>\n",
" <td>ESP</td>\n",
" <td>Spain</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>FI</td>\n",
" <td>FIN</td>\n",
" <td>Finland</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>FR</td>\n",
" <td>FRA</td>\n",
" <td>France</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>GR</td>\n",
" <td>GRC</td>\n",
" <td>Greece</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>HR</td>\n",
" <td>HRV</td>\n",
" <td>Croatia</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>HU</td>\n",
" <td>HUN</td>\n",
" <td>Hungary</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>IE</td>\n",
" <td>IRL</td>\n",
" <td>Ireland</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>IT</td>\n",
" <td>ITA</td>\n",
" <td>Italy</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>134</th>\n",
" <td>LT</td>\n",
" <td>LTU</td>\n",
" <td>Lithuania</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>135</th>\n",
" <td>LU</td>\n",
" <td>LUX</td>\n",
" <td>Luxembourg</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>136</th>\n",
" <td>LV</td>\n",
" <td>LVA</td>\n",
" <td>Latvia</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>151</th>\n",
" <td>MT</td>\n",
" <td>MLT</td>\n",
" <td>Malta</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>NL</td>\n",
" <td>NLD</td>\n",
" <td>Netherlands</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>174</th>\n",
" <td>PL</td>\n",
" <td>POL</td>\n",
" <td>Poland</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>175</th>\n",
" <td>PT</td>\n",
" <td>PRT</td>\n",
" <td>Portugal</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>180</th>\n",
" <td>RO</td>\n",
" <td>ROU</td>\n",
" <td>Romania</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>188</th>\n",
" <td>SE</td>\n",
" <td>SWE</td>\n",
" <td>Sweden</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>191</th>\n",
" <td>SI</td>\n",
" <td>SVN</td>\n",
" <td>Slovenia</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>192</th>\n",
" <td>SK</td>\n",
" <td>SVK</td>\n",
" <td>Slovakia</td>\n",
" <td></td>\n",
" <td>Europe</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td>Y</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ctry_code iso_alpha3 st3_name organisation_flag continent eu_member \n",
"12 AT AUT Austria Europe Y \\\n",
"19 BE BEL Belgium Europe Y \n",
"21 BG BGR Bulgaria Europe Y \n",
"54 CY CYP Cyprus Europe Y \n",
"55 CZ CZE Czechia Europe Y \n",
"57 DE DEU Germany Europe Y \n",
"59 DK DNK Denmark Europe Y \n",
"66 EE EST Estonia Europe Y \n",
"72 ES ESP Spain Europe Y \n",
"75 FI FIN Finland Europe Y \n",
"79 FR FRA France Europe Y \n",
"92 GR GRC Greece Europe Y \n",
"99 HR HRV Croatia Europe Y \n",
"101 HU HUN Hungary Europe Y \n",
"104 IE IRL Ireland Europe Y \n",
"111 IT ITA Italy Europe Y \n",
"134 LT LTU Lithuania Europe Y \n",
"135 LU LUX Luxembourg Europe Y \n",
"136 LV LVA Latvia Europe Y \n",
"151 MT MLT Malta Europe Y \n",
"162 NL NLD Netherlands Europe Y \n",
"174 PL POL Poland Europe Y \n",
"175 PT PRT Portugal Europe Y \n",
"180 RO ROU Romania Europe Y \n",
"188 SE SWE Sweden Europe Y \n",
"191 SI SVN Slovenia Europe Y \n",
"192 SK SVK Slovakia Europe Y \n",
"\n",
" epo_member oecd_member discontinued \n",
"12 Y Y \n",
"19 Y Y \n",
"21 Y \n",
"54 Y \n",
"55 Y Y \n",
"57 Y Y \n",
"59 Y Y \n",
"66 Y Y \n",
"72 Y Y \n",
"75 Y Y \n",
"79 Y Y \n",
"92 Y Y \n",
"99 Y \n",
"101 Y Y \n",
"104 Y Y \n",
"111 Y Y \n",
"134 Y Y \n",
"135 Y Y \n",
"136 Y Y \n",
"151 Y \n",
"162 Y Y \n",
"174 Y Y \n",
"175 Y Y \n",
"180 Y \n",
"188 Y Y \n",
"191 Y Y \n",
"192 Y Y "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eu_df = tls_801[tls_801.eu_member==\"Y\"].compute()\n",
"eu_df"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"ctry_list=list(china_df[\"ctry_code\"])+list(eu_df[\"ctry_code\"])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 0 ns\n",
"Wall time: 7.98 ms\n"
]
}
],
"source": [
"%%time\n",
"tls_appln_interval = tls_201_p[((tls_201_p[\"appln_filing_year\"]>2011)&\n",
" (tls_201_p[\"appln_filing_year\"]<2024)&\n",
" (tls_201_p[\"granted\"]==\"Y\"))][\"appln_id\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"tls_206_p_subgroup = tls_206_p[tls_206_p[\"person_ctry_code\"].isin(ctry_list)][[\"person_id\",\"person_ctry_code\"]]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 44.7 s\n",
"Wall time: 25.5 s\n"
]
}
],
"source": [
"%%time\n",
"appln_pers = (tls_207_p[tls_207_p['appln_id'].isin(tls_appln_interval.compute())]\n",
" ).merge(\n",
" tls_206_p_subgroup,\n",
" on=\"person_id\",how=\"inner\")[[\"appln_id\",\"person_id\",\"person_ctry_code\"]].drop_duplicates()\n",
"\n",
"appln_pers = appln_pers[appln_pers[\"person_ctry_code\"].isin(ctry_list)].drop_duplicates()\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['appln_id', 'person_id', 'person_ctry_code'], dtype='object')"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers.columns"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"appln_pers.to_parquet(\"appln_pers.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2156886"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"appln_pers[\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"id_selector = dd.read_parquet(\"appln_pers.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>person_id</th>\n",
" <th>person_ctry_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>511397485</td>\n",
" <td>63445371</td>\n",
" <td>BE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>451183947</td>\n",
" <td>63448688</td>\n",
" <td>BG</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>477962742</td>\n",
" <td>63474274</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>486088266</td>\n",
" <td>63484131</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>488085169</td>\n",
" <td>63484131</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appln_id person_id person_ctry_code\n",
"0 511397485 63445371 BE\n",
"1 451183947 63448688 BG\n",
"2 477962742 63474274 CN\n",
"3 486088266 63484131 CN\n",
"4 488085169 63484131 CN"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id_selector.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 3.91 s\n",
"Wall time: 3.94 s\n"
]
}
],
"source": [
"%%time\n",
"eu_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(eu_df[\"ctry_code\"]))][\"appln_id\"].unique()\n",
"china_id = id_selector[id_selector[\"person_ctry_code\"].isin(list(china_df[\"ctry_code\"]))][\"appln_id\"].unique()\n",
" \n",
"common_id = id_selector[id_selector[\"appln_id\"].isin(eu_id.compute())&\n",
" id_selector[\"appln_id\"].isin(china_id.compute())]\n",
"\n",
"common_id.to_parquet(\"common_id_CHEU.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 734 ms\n",
"Wall time: 730 ms\n"
]
},
{
"data": {
"text/plain": [
"21420"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"common_id[\"appln_id\"].nunique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"filtered_ids = dd.read_parquet(\"common_id_CHEU.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>person_id</th>\n",
" <th>person_ctry_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>456952666</td>\n",
" <td>63672357</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>459377855</td>\n",
" <td>63692722</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>445248720</td>\n",
" <td>63758608</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>163</th>\n",
" <td>412767507</td>\n",
" <td>63911789</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>312</th>\n",
" <td>486567878</td>\n",
" <td>64332187</td>\n",
" <td>NL</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appln_id person_id person_ctry_code\n",
"59 456952666 63672357 CN\n",
"79 459377855 63692722 CN\n",
"99 445248720 63758608 CN\n",
"163 412767507 63911789 CN\n",
"312 486567878 64332187 NL"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_ids.head()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>person_id</th>\n",
" <th>person_ctry_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5514</th>\n",
" <td>340657036</td>\n",
" <td>5111182</td>\n",
" <td>DK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16190</th>\n",
" <td>340657036</td>\n",
" <td>49115429</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21824</th>\n",
" <td>340657036</td>\n",
" <td>53334372</td>\n",
" <td>DK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18881</th>\n",
" <td>340982410</td>\n",
" <td>49119386</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5445</th>\n",
" <td>340982410</td>\n",
" <td>1949211</td>\n",
" <td>DK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15810</th>\n",
" <td>575399552</td>\n",
" <td>40791395</td>\n",
" <td>SE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17704</th>\n",
" <td>575399552</td>\n",
" <td>49019064</td>\n",
" <td>SE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12146</th>\n",
" <td>575399552</td>\n",
" <td>42040993</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6370</th>\n",
" <td>575406608</td>\n",
" <td>7075122</td>\n",
" <td>DE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11827</th>\n",
" <td>575406608</td>\n",
" <td>18911286</td>\n",
" <td>CN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>87653 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" appln_id person_id person_ctry_code\n",
"5514 340657036 5111182 DK\n",
"16190 340657036 49115429 CN\n",
"21824 340657036 53334372 DK\n",
"18881 340982410 49119386 CN\n",
"5445 340982410 1949211 DK\n",
"... ... ... ...\n",
"15810 575399552 40791395 SE\n",
"17704 575399552 49019064 SE\n",
"12146 575399552 42040993 CN\n",
"6370 575406608 7075122 DE\n",
"11827 575406608 18911286 CN\n",
"\n",
"[87653 rows x 3 columns]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filtered_ids.sort_values(by=\"appln_id\").compute()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 15.6 ms\n",
"Wall time: 31.1 ms\n"
]
}
],
"source": [
"%%time\n",
"id_scope=filtered_ids[\"appln_id\"].unique().compute()\n",
"pers_id_scope=filtered_ids[\"person_id\"].unique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>appln_auth</th>\n",
" <th>appln_nr</th>\n",
" <th>appln_kind</th>\n",
" <th>appln_filing_date</th>\n",
" <th>appln_filing_year</th>\n",
" <th>appln_nr_original</th>\n",
" <th>ipr_type</th>\n",
" <th>receiving_office</th>\n",
" <th>internat_appln_id</th>\n",
" <th>...</th>\n",
" <th>earliest_publn_date</th>\n",
" <th>earliest_publn_year</th>\n",
" <th>earliest_pat_publn_id</th>\n",
" <th>granted</th>\n",
" <th>docdb_family_id</th>\n",
" <th>inpadoc_family_id</th>\n",
" <th>docdb_family_size</th>\n",
" <th>nb_citing_docdb_fam</th>\n",
" <th>nb_applicants</th>\n",
" <th>nb_inventors</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>XX</td>\n",
" <td>None</td>\n",
" <td>D</td>\n",
" <td>9999-12-31</td>\n",
" <td>9999</td>\n",
" <td>None</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>9999-12-31</td>\n",
" <td>9999</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>EP</td>\n",
" <td>103094.0</td>\n",
" <td>A</td>\n",
" <td>2000-02-15</td>\n",
" <td>2000</td>\n",
" <td>00103094</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2000-09-20</td>\n",
" <td>2000</td>\n",
" <td>293253293</td>\n",
" <td>Y</td>\n",
" <td>8554171</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>EP</td>\n",
" <td>107845.0</td>\n",
" <td>A</td>\n",
" <td>1992-12-02</td>\n",
" <td>1992</td>\n",
" <td>00107845</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2000-08-02</td>\n",
" <td>2000</td>\n",
" <td>301548848</td>\n",
" <td>Y</td>\n",
" <td>27517085</td>\n",
" <td>2</td>\n",
" <td>8</td>\n",
" <td>56</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>EP</td>\n",
" <td>202556.0</td>\n",
" <td>A</td>\n",
" <td>2000-07-17</td>\n",
" <td>2000</td>\n",
" <td>00202556</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2001-01-24</td>\n",
" <td>2001</td>\n",
" <td>291964096</td>\n",
" <td>N</td>\n",
" <td>7915918</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>22</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>EP</td>\n",
" <td>300208.0</td>\n",
" <td>A</td>\n",
" <td>2000-01-13</td>\n",
" <td>2000</td>\n",
" <td>00300208</td>\n",
" <td>PI</td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2000-07-26</td>\n",
" <td>2000</td>\n",
" <td>292901055</td>\n",
" <td>Y</td>\n",
" <td>22889365</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>27</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" appln_id appln_auth appln_nr appln_kind appln_filing_date \n",
"0 0 XX None D 9999-12-31 \\\n",
"1 1 EP 103094.0 A 2000-02-15 \n",
"2 2 EP 107845.0 A 1992-12-02 \n",
"3 3 EP 202556.0 A 2000-07-17 \n",
"4 4 EP 300208.0 A 2000-01-13 \n",
"\n",
" appln_filing_year appln_nr_original ipr_type receiving_office \n",
"0 9999 None PI \\\n",
"1 2000 00103094 PI \n",
"2 1992 00107845 PI \n",
"3 2000 00202556 PI \n",
"4 2000 00300208 PI \n",
"\n",
" internat_appln_id ... earliest_publn_date earliest_publn_year \n",
"0 0 ... 9999-12-31 9999 \\\n",
"1 0 ... 2000-09-20 2000 \n",
"2 0 ... 2000-08-02 2000 \n",
"3 0 ... 2001-01-24 2001 \n",
"4 0 ... 2000-07-26 2000 \n",
"\n",
" earliest_pat_publn_id granted docdb_family_id inpadoc_family_id \n",
"0 0 N 0 0 \\\n",
"1 293253293 Y 8554171 1 \n",
"2 301548848 Y 27517085 2 \n",
"3 291964096 N 7915918 3 \n",
"4 292901055 Y 22889365 4 \n",
"\n",
" docdb_family_size nb_citing_docdb_fam nb_applicants nb_inventors \n",
"0 1 0 0 0 \n",
"1 6 79 1 4 \n",
"2 8 56 2 6 \n",
"3 4 22 2 3 \n",
"4 6 27 1 2 \n",
"\n",
"[5 rows x 26 columns]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_201_p.head()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"outdir = \"EU_CH_scope\"\n",
"os.makedirs(outdir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 3min 53s\n",
"Wall time: 2min 28s\n"
]
}
],
"source": [
"%%time\n",
"#Application data\n",
"tls_201_p = dd.read_parquet(\"tls_201.parquet\")\n",
"tls_201_scope = tls_201_p[tls_201_p['appln_id'].isin(id_scope)]\n",
"tls_201_scope.compute().to_csv(f\"{outdir}/tls_201_scope.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 1min 12s\n",
"Wall time: 24.3 s\n"
]
}
],
"source": [
"%%time\n",
"#Person-appln data\n",
"tls_207_p = dd.read_parquet(\"tls_207.parquet\")\n",
"tls_207_scope = tls_207_p[((tls_207_p['person_id'].isin(pers_id_scope))&\n",
" (tls_207_p['appln_id'].isin(id_scope)))]\n",
"tls_207_scope.compute().to_csv(f\"{outdir}/tls_207_scope.csv\",index=False)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 4min 10s\n",
"Wall time: 3min 1s\n"
]
}
],
"source": [
"%%time\n",
"#Person data\n",
"tls_206_p = dd.read_parquet(\"tls_206.parquet\")\n",
"tls_206_scope = tls_206_p[tls_206_p['person_id'].isin(pers_id_scope)]\n",
"tls_206_scope.compute().to_csv(f\"{outdir}/tls_206_scope.csv\",index=False)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 2min 5s\n",
"Wall time: 1min 26s\n"
]
}
],
"source": [
"%%time\n",
"#Application title data\n",
"tls_202_p = dd.read_csv(\"table_tls202.csv\")\n",
"tls_202_scope = tls_202_p[tls_202_p['appln_id'].isin(id_scope)]\n",
"tls_202_scope.compute().to_csv(f\"{outdir}/tls_202_scope.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 2min 15s\n",
"Wall time: 1min 11s\n"
]
}
],
"source": [
"%%time\n",
"#IPC data\n",
"tls_224_p = dd.read_csv(\"table_tls224.csv\")\n",
"tls_224_p_scope = tls_224_p[tls_224_p['appln_id'].isin(id_scope)]\n",
"tls_224_p_scope.compute().to_csv(f\"{outdir}/tls_224_scope.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 1.97 s\n",
"Wall time: 2.66 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>appln_title_lg</th>\n",
" <th>appln_title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>en</td>\n",
" <td>Method and means for using additional cards in...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>en</td>\n",
" <td>Production of anti-self antibodies from antibo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>en</td>\n",
" <td>Scintillation radiation detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>en</td>\n",
" <td>Wire bonding to copper</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>en</td>\n",
" <td>Method of manufacturing electrodes for thin fi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>en</td>\n",
" <td>System for automatically routing calls to call...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>en</td>\n",
" <td>MULTILAYER TELECOMMUNICATIONS NETWORK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>en</td>\n",
" <td>EXHAUST GAS PURIFYING CATALYST COMPOUND, CATAL...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>en</td>\n",
" <td>SPRAYHEAD WITH NOZZLES MADE BY BORING</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>en</td>\n",
" <td>Diphosphines</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appln_id appln_title_lg appln_title\n",
"0 1 en Method and means for using additional cards in...\n",
"1 2 en Production of anti-self antibodies from antibo...\n",
"2 3 en Scintillation radiation detector\n",
"3 4 en Wire bonding to copper\n",
"4 5 en Method of manufacturing electrodes for thin fi...\n",
"5 6 en System for automatically routing calls to call...\n",
"6 7 en MULTILAYER TELECOMMUNICATIONS NETWORK\n",
"7 8 en EXHAUST GAS PURIFYING CATALYST COMPOUND, CATAL...\n",
"8 9 en SPRAYHEAD WITH NOZZLES MADE BY BORING\n",
"9 10 en Diphosphines"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"tls_202 = dd.read_csv(\"table_tls202.csv\", low_memory=False)\n",
"tls_202.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>tech_rel_appln_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>8</td>\n",
" <td>31308284</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>159</td>\n",
" <td>48940218</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>159</td>\n",
" <td>51019180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>159</td>\n",
" <td>51022388</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>159</td>\n",
" <td>51022511</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appln_id tech_rel_appln_id\n",
"0 8 31308284\n",
"1 159 48940218\n",
"2 159 51019180\n",
"3 159 51022388\n",
"4 159 51022511"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_205 = dd.read_csv(\"table_tls205.csv\", low_memory=False)\n",
"tls_205.head()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# Countries"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 COMPANY\n",
"1 NaN\n",
"2 GOV NON-PROFIT\n",
"3 INDIVIDUAL\n",
"4 UNIVERSITY\n",
"5 UNKNOWN\n",
"6 GOV NON-PROFIT UNIVERSITY\n",
"7 COMPANY GOV NON-PROFIT\n",
"8 HOSPITAL\n",
"9 COMPANY HOSPITAL\n",
"10 COMPANY UNIVERSITY\n",
"11 COMPANY GOV NON-PROFIT UNIVERSITY\n",
"12 COMPANY INDIVIDUAL\n",
"13 COMPANY GOV NON-PROFIT \n",
"14 GOV NON-PROFIT HOSPITAL\n",
"Name: psn_sector, dtype: object"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_206[\"psn_sector\"].unique().compute()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>person_orig_id</th>\n",
" <th>person_id</th>\n",
" <th>source</th>\n",
" <th>source_version</th>\n",
" <th>name_freeform</th>\n",
" <th>person_name_orig_lg</th>\n",
" <th>last_name</th>\n",
" <th>first_name</th>\n",
" <th>middle_name</th>\n",
" <th>address_freeform</th>\n",
" <th>...</th>\n",
" <th>address_3</th>\n",
" <th>address_4</th>\n",
" <th>address_5</th>\n",
" <th>street</th>\n",
" <th>city</th>\n",
" <th>zip_code</th>\n",
" <th>state</th>\n",
" <th>person_ctry_code</th>\n",
" <th>residence_ctry_code</th>\n",
" <th>role</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>EPREG</td>\n",
" <td>NaN</td>\n",
" <td>Nokia Corporation</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>FI</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>EPREG</td>\n",
" <td>NaN</td>\n",
" <td>Lipponen, Markku</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>FI</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>EPREG</td>\n",
" <td>NaN</td>\n",
" <td>Laitinen, Timo</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>FI</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>EPREG</td>\n",
" <td>NaN</td>\n",
" <td>Aho, Ari</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>FI</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>EPREG</td>\n",
" <td>NaN</td>\n",
" <td>Knuutila, Jarno</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>FI</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" person_orig_id person_id source source_version name_freeform \n",
"0 1 1 EPREG NaN Nokia Corporation \\\n",
"1 2 2 EPREG NaN Lipponen, Markku \n",
"2 3 3 EPREG NaN Laitinen, Timo \n",
"3 4 4 EPREG NaN Aho, Ari \n",
"4 5 5 EPREG NaN Knuutila, Jarno \n",
"\n",
" person_name_orig_lg last_name first_name middle_name address_freeform \n",
"0 NaN NaN NaN NaN NaN \\\n",
"1 NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN \n",
"\n",
" ... address_3 address_4 address_5 street city zip_code state \n",
"0 ... NaN NaN NaN NaN NaN NaN \\\n",
"1 ... NaN NaN NaN NaN NaN NaN \n",
"2 ... NaN NaN NaN NaN NaN NaN \n",
"3 ... NaN NaN NaN NaN NaN NaN \n",
"4 ... NaN NaN NaN NaN NaN NaN \n",
"\n",
" person_ctry_code residence_ctry_code role \n",
"0 FI NaN \n",
"1 FI NaN \n",
"2 FI NaN \n",
"3 FI NaN \n",
"4 FI NaN \n",
"\n",
"[5 rows x 22 columns]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_226 = dd.read_csv(\"table_tls226.csv\", low_memory=False)\n",
"tls_226.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"137"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_226.npartitions"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>nat_class_symbol</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>379100</td>\n",
" <td>G2J JGFG JGFG</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>448592</td>\n",
" <td>NOT CLASSIFIED</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>448593</td>\n",
" <td>NOT CLASSIFIED</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>448594</td>\n",
" <td>NOT CLASSIFIED</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>448595</td>\n",
" <td>NOT CLASSIFIED</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appln_id nat_class_symbol\n",
"0 379100 G2J JGFG JGFG\n",
"1 448592 NOT CLASSIFIED \n",
"2 448593 NOT CLASSIFIED \n",
"3 448594 NOT CLASSIFIED \n",
"4 448595 NOT CLASSIFIED "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tls_210 = dd.read_csv(\"table_tls210.csv\", low_memory=False)\n",
"tls_210.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"df = tls_210[tls_210.appln_id == 448594] # Select a subsection\n",
"result = df#.groupby('id').value.mean() # Reduce to a smaller size\n",
"result = result.compute()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appln_id</th>\n",
" <th>nat_class_symbol</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>448594</td>\n",
" <td>NOT CLASSIFIED</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appln_id nat_class_symbol\n",
"3 448594 NOT CLASSIFIED "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}