You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/PATSTAT/person_minipipe.ipynb

215 lines
16 KiB
Plaintext

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"D:\\PATSTAT\n",
"D:\\PATSTAT\n"
]
}
],
"source": [
"import os\n",
"\n",
"import dask\n",
"import dask.dataframe as dd\n",
"\n",
"# Tell Dask where to put its temporary files.\n",
"dask.config.set({'temporary_directory': r'D:\\PATSTAT\\dask_temp'})\n",
"\n",
"print(os.getcwd())  # working directory before the switch\n",
"\n",
"# Relative file names used in later cells resolve against this directory.\n",
"workdir_path = r\"D:\\PATSTAT\"\n",
"os.chdir(workdir_path)\n",
"print(os.getcwd())  # working directory after the switch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"# Convert the CSV export to Parquet only when the Parquet copy is missing,\n",
"# so re-running the notebook does not repeat the conversion.\n",
"# Delete the tls_206.parquet folder to force a fresh conversion.\n",
"if not os.path.exists(\"tls_206.parquet\"):\n",
"    tls_206 = dd.read_csv(\"table_tls206.csv\", low_memory=False)\n",
"    tls_206.to_parquet(\"tls_206.parquet\")\n",
"\n",
"# Person data, read back from the Parquet copy\n",
"tls_206_p = dd.read_parquet(\"tls_206.parquet\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"# Convert the CSV export to Parquet only when the Parquet copy is missing,\n",
"# so re-running the notebook does not repeat the conversion.\n",
"# Delete the tls_209.parquet folder to force a fresh conversion.\n",
"if not os.path.exists(\"tls_209.parquet\"):\n",
"    tls_209 = dd.read_csv(\"table_tls209.csv\", low_memory=False)\n",
"    tls_209.to_parquet(\"tls_209.parquet\")\n",
"\n",
"# IPC data (the scoped export of this table is named tls_209_IPC_scope.csv),\n",
"# read back from the Parquet copy\n",
"tls_209_p = dd.read_parquet(\"tls_209.parquet\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"# Earlier loader, kept for reference only; the active load of appln_pers_f is in the next cell.\n",
"# import pandas as pd\n",
"# appln_pers_f = pd.read_csv(r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\\first_round\\first-filings-with-persons-raw.csv\", header=None,\n",
"# names=[\"appln_id\",\"appln_auth\",\"person_id\",\"invt_seq_nr\",\"applt_seq_nr\",'person_name',\"person_ctry_code\"])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Project folder and the sub-folder that holds the raw CSV files.\n",
"project_dir = \"C:/Users/radvanyi/PycharmProjects/ZSI_analytics/PATSTAT\"\n",
"outdir = \"raw_files_csv\"\n",
"\n",
"# Persons file; its person_id and appln_id columns define the scope used in later cells.\n",
"# low_memory=False: infer each column's dtype from the whole file rather than chunk by chunk.\n",
"persons_csv = \"/\".join([project_dir, outdir, \"02_persons_2011_2022.csv\"])\n",
"appln_pers_f = pd.read_csv(persons_csv, low_memory=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [
"# Distinct person ids found in the persons file.\n",
"person_ids = appln_pers_f[\"person_id\"]\n",
"pers_id_scope = person_ids.unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"# Keep only the tls_206 rows whose person_id is in scope (nothing is computed yet).\n",
"person_in_scope = tls_206_p['person_id'].isin(pers_id_scope)\n",
"tls_206_scope = tls_206_p[person_in_scope]\n",
"\n",
"# Materialise the selection with .compute() and write it to one CSV file, index omitted.\n",
"tls_206_scope_csv = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\\first_round\\tls_206_scope_v2.csv\"\n",
"tls_206_scope.compute().to_csv(tls_206_scope_csv, index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"# Distinct application ids found in the persons file.\n",
"appln_ids = appln_pers_f[\"appln_id\"]\n",
"appln_id_scope = appln_ids.unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"data": {
"text/plain": "12646904"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct application ids in scope.\n",
"appln_id_scope.shape[0]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [],
"source": [
"# Keep only the tls_209 rows whose appln_id is in scope (nothing is computed yet).\n",
"appln_in_scope = tls_209_p['appln_id'].isin(appln_id_scope)\n",
"tls_209_p_scope = tls_209_p[appln_in_scope]\n",
"\n",
"# Materialise the selection with .compute() and write it to one CSV file, index omitted.\n",
"tls_209_scope_csv = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\PATSTAT\\first_round\\tls_209_IPC_scope.csv\"\n",
"tls_209_p_scope.compute().to_csv(tls_209_scope_csv, index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# Materialise the scoped tls_206 selection for the inspection below.\n",
"(df_206,) = dask.compute(tls_206_scope)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": " person_id person_name \n4023 4025 Meritor Heavy Vehicle Braking Systems (UK) Lim... \\\n4347 4349 Fraser, Stuart \n5627 5629 Xaar Technology Limited \n5811 5813 SIEMENS PLC \n6499 6501 BAE Systems PLC \n... ... ... \n366118 88823960 WARD, Lauren \n366119 88823961 WÄRTSILÄ UK LIMITED \n366130 88823972 Xavier Erdödy \n366135 88823977 Zeg.AI Ltd \n366137 88823979 Zhiyang Pan \n\n person_name_orig_lg \n4023 Meritor Heavy Vehicle Braking Systems (UK) Lim... \\\n4347 Fraser, Stuart \n5627 Xaar Technology Limited \n5811 SIEMENS PLC \n6499 BAE Systems PLC \n... ... \n366118 WARD, Lauren \n366119 WÄRTSILÄ UK LIMITED \n366130 Xavier Erdödy \n366135 Zeg.AI Ltd \n366137 Zhiyang Pan \n\n person_address person_ctry_code \n4023 Grange Road Cwmbran,Gwent NP44 3XU GB \\\n4347 Fernhill Lees Lane,Little Neston Cheshire CH64... GB \n5627 Unit 316, Science Park,Cambridge CB4 0XR GB \n5811 Faraday House Sir William Siemens Square Friml... GB \n6499 6 Carlton Gardens,London SW1Y 5AD GB \n... ... ... \n366118 None GB \n366119 None GB \n366130 None GB \n366135 None GB \n366137 None GB \n\n nuts nuts_level doc_std_name_id \n4023 UKL16 3 25273975 \\\n4347 UKD63 3 3738 \n5627 UKH12 3 4824 \n5811 UKJ25 3 4979 \n6499 UKI32 3 5583 \n... ... ... ... \n366118 UK 0 40301088 \n366119 UK 0 21929085 \n366130 UK 0 40578262 \n366135 UK 0 37017676 \n366137 UK 0 17409767 \n\n doc_std_name psn_id \n4023 MERITOR HEAVY VEHICLE BRAKING SYSTEMS UK LTD 21718818 \\\n4347 FRASER STUART 9243356 \n5627 XAAR TECHNOLOGY LTD 35706185 \n5811 SIEMENS PLC 30138991 \n6499 BAE SYSTEMS PLC 1787059 \n... ... ... \n366118 WARD LAUREN 188823960 \n366119 RTSIL UK LTD W 188823961 \n366130 XAVIER ERDÖDY 188823972 \n366135 ZEG AI LTD 188823977 \n366137 ZHIYANG PAN 188823979 \n\n psn_name psn_level psn_sector \n4023 MERITOR HEAVY VEHICLE BRAKING SYSTEMS (UK) 1 COMPANY \\\n4347 FRASER, STUART 0 None \n5627 XAAR TECHNOLOGY 1 COMPANY \n5811 SIEMENS 2 COMPANY \n6499 BAE SYSTEMS 2 COMPANY \n... ... ... ... 
\n366118 WARD, Lauren 0 UNKNOWN \n366119 WÄRTSILÄ UK LIMITED 0 UNKNOWN \n366130 Xavier Erdödy 0 UNKNOWN \n366135 Zeg.AI Ltd 0 UNKNOWN \n366137 Zhiyang Pan 0 UNKNOWN \n\n han_id han_name \n4023 1940089 MERITOR HEAVY VEHICLE BRAKING SYSTEMS UK LTD \\\n4347 100004349 Fraser, Stuart \n5627 3228426 XAAR TECH LTD \n5811 2755905 SIEMENS PLC \n6499 208539 BAE SYSTEMS PLC \n... ... ... \n366118 188823960 WARD, Lauren \n366119 188823961 WÄRTSILÄ UK LIMITED \n366130 188823972 Xavier Erdödy \n366135 188823977 Zeg.AI Ltd \n366137 188823979 Zhiyang Pan \n\n han_harmonized \n4023 2 \n4347 0 \n5627 2 \n5811 2 \n6499 2 \n... ... \n366118 0 \n366119 0 \n366130 0 \n366135 0 \n366137 0 \n\n[77303 rows x 16 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>person_id</th>\n <th>person_name</th>\n <th>person_name_orig_lg</th>\n <th>person_address</th>\n <th>person_ctry_code</th>\n <th>nuts</th>\n <th>nuts_level</th>\n <th>doc_std_name_id</th>\n <th>doc_std_name</th>\n <th>psn_id</th>\n <th>psn_name</th>\n <th>psn_level</th>\n <th>psn_sector</th>\n <th>han_id</th>\n <th>han_name</th>\n <th>han_harmonized</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>4023</th>\n <td>4025</td>\n <td>Meritor Heavy Vehicle Braking Systems (UK) Lim...</td>\n <td>Meritor Heavy Vehicle Braking Systems (UK) Lim...</td>\n <td>Grange Road Cwmbran,Gwent NP44 3XU</td>\n <td>GB</td>\n <td>UKL16</td>\n <td>3</td>\n <td>25273975</td>\n <td>MERITOR HEAVY VEHICLE BRAKING SYSTEMS UK LTD</td>\n <td>21718818</td>\n <td>MERITOR HEAVY VEHICLE BRAKING SYSTEMS (UK)</td>\n <td>1</td>\n <td>COMPANY</td>\n <td>1940089</td>\n <td>MERITOR HEAVY VEHICLE BRAKING SYSTEMS UK LTD</td>\n <td>2</td>\n </tr>\n <tr>\n <th>4347</th>\n <td>4349</td>\n <td>Fraser, Stuart</td>\n <td>Fraser, Stuart</td>\n <td>Fernhill Lees Lane,Little Neston Cheshire CH64...</td>\n <td>GB</td>\n <td>UKD63</td>\n <td>3</td>\n <td>3738</td>\n <td>FRASER STUART</td>\n <td>9243356</td>\n <td>FRASER, STUART</td>\n <td>0</td>\n <td>None</td>\n <td>100004349</td>\n <td>Fraser, Stuart</td>\n <td>0</td>\n </tr>\n <tr>\n <th>5627</th>\n <td>5629</td>\n <td>Xaar Technology Limited</td>\n <td>Xaar Technology Limited</td>\n <td>Unit 316, Science Park,Cambridge CB4 0XR</td>\n <td>GB</td>\n <td>UKH12</td>\n <td>3</td>\n <td>4824</td>\n <td>XAAR TECHNOLOGY LTD</td>\n <td>35706185</td>\n <td>XAAR TECHNOLOGY</td>\n <td>1</td>\n <td>COMPANY</td>\n <td>3228426</td>\n <td>XAAR TECH LTD</td>\n 
<td>2</td>\n </tr>\n <tr>\n <th>5811</th>\n <td>5813</td>\n <td>SIEMENS PLC</td>\n <td>SIEMENS PLC</td>\n <td>Faraday House Sir William Siemens Square Friml...</td>\n <td>GB</td>\n <td>UKJ25</td>\n <td>3</td>\n <td>4979</td>\n <td>SIEMENS PLC</td>\n <td>30138991</td>\n <td>SIEMENS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>2755905</td>\n <td>SIEMENS PLC</td>\n <td>2</td>\n </tr>\n <tr>\n <th>6499</th>\n <td>6501</td>\n <td>BAE Systems PLC</td>\n <td>BAE Systems PLC</td>\n <td>6 Carlton Gardens,London SW1Y 5AD</td>\n <td>GB</td>\n <td>UKI32</td>\n <td>3</td>\n <td>5583</td>\n <td>BAE SYSTEMS PLC</td>\n <td>1787059</td>\n <td>BAE SYSTEMS</td>\n <td>2</td>\n <td>COMPANY</td>\n <td>208539</td>\n <td>BAE SYSTEMS PLC</td>\n <td>2</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>366118</th>\n <td>88823960</td>\n <td>WARD, Lauren</td>\n <td>WARD, Lauren</td>\n <td>None</td>\n <td>GB</td>\n <td>UK</td>\n <td>0</td>\n <td>40301088</td>\n <td>WARD LAUREN</td>\n <td>188823960</td>\n <td>WARD, Lauren</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188823960</td>\n <td>WARD, Lauren</td>\n <td>0</td>\n </tr>\n <tr>\n <th>366119</th>\n <td>88823961</td>\n <td>WÄRTSILÄ UK LIMITED</td>\n <td>WÄRTSILÄ UK LIMITED</td>\n <td>None</td>\n <td>GB</td>\n <td>UK</td>\n <td>0</td>\n <td>21929085</td>\n <td>RTSIL UK LTD W</td>\n <td>188823961</td>\n <td>WÄRTSILÄ UK LIMITED</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188823961</td>\n <td>WÄRTSILÄ UK LIMITED</td>\n <td>0</td>\n </tr>\n <tr>\n <th>366130</th>\n <td>88823972</td>\n <td>Xavier Erdödy</td>\n <td>Xavier Erdödy</td>\n <td>None</td>\n <td>GB</td>\n <td>UK</td>\n <td>0</td>\n <td>40578262</td>\n <td>XAVIER ERDÖDY</td>\n <td>188823972</td>\n <td>Xavier Erdödy</td>\n <td>0</td>\n <td>UNKNOWN</td>\n 
<td>188823972</td>\n <td>Xavier Erdödy</td>\n <td>0</td>\n </tr>\n <tr>\n <th>366135</th>\n <td>88823977</td>\n <td>Zeg.AI Ltd</td>\n <td>Zeg.AI Ltd</td>\n <td>None</td>\n <td>GB</td>\n <td>UK</td>\n <td>0</td>\n <td>37017676</td>\n <td>ZEG AI LTD</td>\n <td>188823977</td>\n <td>Zeg.AI Ltd</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188823977</td>\n <td>Zeg.AI Ltd</td>\n <td>0</td>\n </tr>\n <tr>\n <th>366137</th>\n <td>88823979</td>\n <td>Zhiyang Pan</td>\n <td>Zhiyang Pan</td>\n <td>None</td>\n <td>GB</td>\n <td>UK</td>\n <td>0</td>\n <td>17409767</td>\n <td>ZHIYANG PAN</td>\n <td>188823979</td>\n <td>Zhiyang Pan</td>\n <td>0</td>\n <td>UNKNOWN</td>\n <td>188823979</td>\n <td>Zhiyang Pan</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>77303 rows × 16 columns</p>\n</div>"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_206[df_206[\"person_ctry_code\"]==\"GB\"]"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}