{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import shutil\n",
    "from flashgeotext.geotext import GeoText\n",
    "import re"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "import hashlib\n",
    "\n",
    "def md5hash(s: str):\n",
    "    \"\"\"Return the hexadecimal MD5 digest of `s` after encoding it as UTF-8.\"\"\"\n",
    "    hasher = hashlib.md5()\n",
    "    hasher.update(s.encode('utf-8'))\n",
    "    return hasher.hexdigest()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column that uniquely identifies a WoS record.\n",
    "record_col=\"UT (Unique WOS ID)\"\n",
    "# Location of the concatenated WoS export. Set the WOS_RECORDS_CSV environment\n",
    "# variable to run the notebook on another machine; without it the original,\n",
    "# machine-specific path is used unchanged.\n",
    "outfile = os.environ.get(\n",
    "    \"WOS_RECORDS_CSV\",\n",
    "    r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "      Publication Type                                            Authors   \n29758                C                           Fu, YC; Liu, YH; Gao, ZW  \\\n34098                J                  Han, D; Zhang, CH; Fauconnier, ML   \n55478                C      Xu, YX; Liu, M; Peng, L; Zhang, JQ; Zheng, YW   \n32260                C         Liu, Q; Cai, WD; Fu, ZJ; Shen, J; Linge, N   \n8751                 J  Shamshirband, S; Nodoushan, EJ; Adolf, JE; Man...   \n...                ...                                                ...   \n6151                 C    Seufert, M; Casas, P; Wehner, N; Gang, L; Li, K   \n32052                J  Huber, A; Kinna, D; Huber, V; Arnoux, G; Balbo...   \n27985                J      Dong, GP; Ma, J; Kwan, MP; Wang, YM; Chai, YW   \n2939                 J              Yin, ZY; Jin, YF; Huang, HW; Shen, SL   \n34651                J               Wang, JH; Lindenbergh, R; Menenti, M   \n\n      Book Authors                                       Book Editors   \n29758          NaN                                              Yu, H  \\\n34098          NaN                                                NaN   \n55478          NaN                                                NaN   \n32260          NaN        Fang, WC; Vasilakos, T; Stoica, A; Kwak, YS   \n8751           NaN                                                NaN   \n...            ...                                                ...   \n6151           NaN  Galis, A; Guillemin, F; Noldus, R; Secci, S; I...   
\n32052          NaN                                                NaN   \n27985          NaN                                                NaN   \n2939           NaN                                                NaN   \n34651          NaN                                                NaN   \n\n      Book Group Authors                                  Author Full Names   \n29758                NaN            Fu, Yichuan; Liu, Yuanhong; Gao, Zhiwei  \\\n34098                NaN  Han, Dong; Zhang, Chun-Hui; Fauconnier, Marie-...   \n55478               IEEE  Xu, Yuxuan; Liu, Ming; Peng, Linning; Zhang, J...   \n32260                NaN  Liu, Qi; Cai, Weidong; Fu, Zhangjie; Shen, Jia...   \n8751                 NaN  Shamshirband, Shahaboddin; Nodoushan, Ehsan Ja...   \n...                  ...                                                ...   \n6151                 NaN  Seufert, Michael; Casas, Pedro; Wehner, Nikola...   \n32052                NaN  Huber, A.; Kinna, D.; Huber, V.; Arnoux, G.; B...   \n27985                NaN  Dong, Guanpeng; Ma, Jing; Kwan, Mei-Po; Wang, ...   \n2939                 NaN  Yin Zhen-Yu; Jin Yin-Fu; Huang Hong-Wei; Shen ...   \n34651                NaN  Wang, Jinhu; Lindenbergh, Roderik; Menenti, Ma...   \n\n      Book Author Full Names     Group Authors   \n29758                    NaN               NaN  \\\n34098                    NaN               NaN   \n55478                    NaN               NaN   \n32260                    NaN               NaN   \n8751                     NaN               NaN   \n...                      ...               ...   \n6151                     NaN               NaN   \n32052                    NaN  JET Contributors   \n27985                    NaN               NaN   \n2939                     NaN               NaN   \n34651                    NaN               NaN   \n\n                                           Article Title   \n29758  Multiple Actuator Fault Classification in Wind...  
\\\n34098  Effect of Seasoning Addition on Volatile Compo...   \n55478  Colluding RF Fingerprint Impersonation Attack ...   \n32260  An Optimized Strategy for Speculative Executio...   \n8751   Ensemble models with uncertainty analysis for ...   \n...                                                  ...   \n6151   Stream-based Machine Learning for Real-time Qo...   \n32052  The near infrared imaging system for the real-...   \n27985  Multi-level temporal autoregressive modelling ...   \n2939   Evolutionary polynomial regression based model...   \n34651  SigVox - A 3D feature matching algorithm for a...   \n\n                                            Source Title  ...   \n29758  2019 25TH IEEE INTERNATIONAL CONFERENCE ON AUT...  ...  \\\n34098                                              FOODS  ...   \n55478  IEEE INTERNATIONAL CONFERENCE ON COMMUNICATION...  ...   \n32260  2015 9TH INTERNATIONAL CONFERENCE ON FUTURE GE...  ...   \n8751   ENGINEERING APPLICATIONS OF COMPUTATIONAL FLUI...  ...   \n...                                                  ...  ...   \n6151   PROCEEDINGS OF THE 2019 22ND CONFERENCE ON INN...  ...   \n32052                                    PHYSICA SCRIPTA  ...   \n27985  INTERNATIONAL JOURNAL OF GEOGRAPHICAL INFORMAT...  ...   \n2939                                 ENGINEERING GEOLOGY  ...   \n34651  ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SEN...  ...   \n\n                                          WoS Categories   \n29758  Automation & Control Systems; Computer Science...  \\\n34098                          Food Science & Technology   \n55478                                 Telecommunications   \n32260          Computer Science, Hardware & Architecture   \n8751   Engineering, Multidisciplinary; Engineering, M...   \n...                                                  ...   \n6151   Computer Science, Hardware & Architecture; Com...   
\n32052                         Physics, Multidisciplinary   \n27985  Computer Science, Information Systems; Geograp...   \n2939   Engineering, Geological; Geosciences, Multidis...   \n34651  Geography, Physical; Geosciences, Multidiscipl...   \n\n                                    Web of Science Index   \n29758  Conference Proceedings Citation Index - Scienc...  \\\n34098     Science Citation Index Expanded (SCI-EXPANDED)   \n55478  Conference Proceedings Citation Index - Scienc...   \n32260  Conference Proceedings Citation Index - Scienc...   \n8751      Science Citation Index Expanded (SCI-EXPANDED)   \n...                                                  ...   \n6151   Conference Proceedings Citation Index - Scienc...   \n32052  Science Citation Index Expanded (SCI-EXPANDED)...   \n27985  Science Citation Index Expanded (SCI-EXPANDED)...   \n2939      Science Citation Index Expanded (SCI-EXPANDED)   \n34651     Science Citation Index Expanded (SCI-EXPANDED)   \n\n                                          Research Areas IDS Number   \n29758     Automation & Control Systems; Computer Science      BP9AN  \\\n34098                          Food Science & Technology      PV8DT   \n55478                                 Telecommunications      BT9VG   \n32260                                   Computer Science      BF1GE   \n8751                              Engineering; Mechanics      HE2WU   \n...                                                  ...        ...   \n6151                                    Computer Science      BM8PP   \n32052                                            Physics      FL3JX   \n27985  Computer Science; Geography; Physical Geograph...      GS7LK   \n2939                                Engineering; Geology      DS2IG   \n34651  Physical Geography; Geology; Remote Sensing; I...      
EX2BV   \n\n        Pubmed Id Open Access Designations Highly Cited Status   \n29758         NaN                      NaN                 NaN  \\\n34098  33406625.0    gold, Green Published                 NaN   \n55478         NaN                      NaN                 NaN   \n32260         NaN                      NaN                 NaN   \n8751          NaN    Green Published, gold                   Y   \n...           ...                      ...                 ...   \n6151          NaN                      NaN                 NaN   \n32052         NaN                      NaN                 NaN   \n27985         NaN  hybrid, Green Published                 NaN   \n2939          NaN                      NaN                 NaN   \n34651         NaN                      NaN                 NaN   \n\n      Hot Paper Status Date of Export   UT (Unique WOS ID)  \n29758              NaN     2023-04-28  WOS:000568623100060  \n34098              NaN     2023-04-28  WOS:000610212800001  \n55478              NaN     2023-04-28  WOS:000864709903078  \n32260              NaN     2023-04-28  WOS:000380393500003  \n8751                 N     2023-04-28  WOS:000453212200001  \n...                ...            ...                  ...  \n6151               NaN     2023-04-28  WOS:000469794500014  \n32052              NaN     2023-04-28  WOS:000414120500027  \n27985              NaN     2023-04-28  WOS:000443882300004  \n2939               NaN     2023-04-28  WOS:000380592100015  \n34651              NaN     2023-04-28  WOS:000403031400010  \n\n[100 rows x 71 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Publication Type</th>\n      <th>Authors</th>\n      <th>Book Authors</th>\n      <th>Book Editors</th>\n      <th>Book Group Authors</th>\n      <th>Author Full Names</th>\n      <th>Book Author Full Names</th>\n      <th>Group Authors</th>\n      <th>Article Title</th>\n      <th>Source Title</th>\n      <th>...</th>\n      <th>WoS Categories</th>\n      <th>Web of Science Index</th>\n      <th>Research Areas</th>\n      <th>IDS Number</th>\n      <th>Pubmed Id</th>\n      <th>Open Access Designations</th>\n      <th>Highly Cited Status</th>\n      <th>Hot Paper Status</th>\n      <th>Date of Export</th>\n      <th>UT (Unique WOS ID)</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>29758</th>\n      <td>C</td>\n      <td>Fu, YC; Liu, YH; Gao, ZW</td>\n      <td>NaN</td>\n      <td>Yu, H</td>\n      <td>NaN</td>\n      <td>Fu, Yichuan; Liu, Yuanhong; Gao, Zhiwei</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Multiple Actuator Fault Classification in Wind...</td>\n      <td>2019 25TH IEEE INTERNATIONAL CONFERENCE ON AUT...</td>\n      <td>...</td>\n      <td>Automation &amp; Control Systems; Computer Science...</td>\n      <td>Conference Proceedings Citation Index - Scienc...</td>\n      <td>Automation &amp; Control Systems; Computer Science</td>\n      <td>BP9AN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000568623100060</td>\n    </tr>\n    <tr>\n      <th>34098</th>\n      <td>J</td>\n      <td>Han, D; Zhang, CH; Fauconnier, ML</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      
<td>NaN</td>\n      <td>Han, Dong; Zhang, Chun-Hui; Fauconnier, Marie-...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Effect of Seasoning Addition on Volatile Compo...</td>\n      <td>FOODS</td>\n      <td>...</td>\n      <td>Food Science &amp; Technology</td>\n      <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n      <td>Food Science &amp; Technology</td>\n      <td>PV8DT</td>\n      <td>33406625.0</td>\n      <td>gold, Green Published</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000610212800001</td>\n    </tr>\n    <tr>\n      <th>55478</th>\n      <td>C</td>\n      <td>Xu, YX; Liu, M; Peng, L; Zhang, JQ; Zheng, YW</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>IEEE</td>\n      <td>Xu, Yuxuan; Liu, Ming; Peng, Linning; Zhang, J...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Colluding RF Fingerprint Impersonation Attack ...</td>\n      <td>IEEE INTERNATIONAL CONFERENCE ON COMMUNICATION...</td>\n      <td>...</td>\n      <td>Telecommunications</td>\n      <td>Conference Proceedings Citation Index - Scienc...</td>\n      <td>Telecommunications</td>\n      <td>BT9VG</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000864709903078</td>\n    </tr>\n    <tr>\n      <th>32260</th>\n      <td>C</td>\n      <td>Liu, Q; Cai, WD; Fu, ZJ; Shen, J; Linge, N</td>\n      <td>NaN</td>\n      <td>Fang, WC; Vasilakos, T; Stoica, A; Kwak, YS</td>\n      <td>NaN</td>\n      <td>Liu, Qi; Cai, Weidong; Fu, Zhangjie; Shen, Jia...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>An Optimized Strategy for Speculative Executio...</td>\n      <td>2015 9TH INTERNATIONAL CONFERENCE ON FUTURE GE...</td>\n      <td>...</td>\n      <td>Computer Science, Hardware &amp; Architecture</td>\n      <td>Conference Proceedings Citation Index - Scienc...</td>\n      <td>Computer Science</td>\n      <td>BF1GE</td>\n      <td>NaN</td>\n   
   <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000380393500003</td>\n    </tr>\n    <tr>\n      <th>8751</th>\n      <td>J</td>\n      <td>Shamshirband, S; Nodoushan, EJ; Adolf, JE; Man...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Shamshirband, Shahaboddin; Nodoushan, Ehsan Ja...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Ensemble models with uncertainty analysis for ...</td>\n      <td>ENGINEERING APPLICATIONS OF COMPUTATIONAL FLUI...</td>\n      <td>...</td>\n      <td>Engineering, Multidisciplinary; Engineering, M...</td>\n      <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n      <td>Engineering; Mechanics</td>\n      <td>HE2WU</td>\n      <td>NaN</td>\n      <td>Green Published, gold</td>\n      <td>Y</td>\n      <td>N</td>\n      <td>2023-04-28</td>\n      <td>WOS:000453212200001</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>6151</th>\n      <td>C</td>\n      <td>Seufert, M; Casas, P; Wehner, N; Gang, L; Li, K</td>\n      <td>NaN</td>\n      <td>Galis, A; Guillemin, F; Noldus, R; Secci, S; I...</td>\n      <td>NaN</td>\n      <td>Seufert, Michael; Casas, Pedro; Wehner, Nikola...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Stream-based Machine Learning for Real-time Qo...</td>\n      <td>PROCEEDINGS OF THE 2019 22ND CONFERENCE ON INN...</td>\n      <td>...</td>\n      <td>Computer Science, Hardware &amp; Architecture; Com...</td>\n      <td>Conference Proceedings Citation Index - Scienc...</td>\n    
  <td>Computer Science</td>\n      <td>BM8PP</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000469794500014</td>\n    </tr>\n    <tr>\n      <th>32052</th>\n      <td>J</td>\n      <td>Huber, A; Kinna, D; Huber, V; Arnoux, G; Balbo...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Huber, A.; Kinna, D.; Huber, V.; Arnoux, G.; B...</td>\n      <td>NaN</td>\n      <td>JET Contributors</td>\n      <td>The near infrared imaging system for the real-...</td>\n      <td>PHYSICA SCRIPTA</td>\n      <td>...</td>\n      <td>Physics, Multidisciplinary</td>\n      <td>Science Citation Index Expanded (SCI-EXPANDED)...</td>\n      <td>Physics</td>\n      <td>FL3JX</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000414120500027</td>\n    </tr>\n    <tr>\n      <th>27985</th>\n      <td>J</td>\n      <td>Dong, GP; Ma, J; Kwan, MP; Wang, YM; Chai, YW</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Dong, Guanpeng; Ma, Jing; Kwan, Mei-Po; Wang, ...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Multi-level temporal autoregressive modelling ...</td>\n      <td>INTERNATIONAL JOURNAL OF GEOGRAPHICAL INFORMAT...</td>\n      <td>...</td>\n      <td>Computer Science, Information Systems; Geograp...</td>\n      <td>Science Citation Index Expanded (SCI-EXPANDED)...</td>\n      <td>Computer Science; Geography; Physical Geograph...</td>\n      <td>GS7LK</td>\n      <td>NaN</td>\n      <td>hybrid, Green Published</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000443882300004</td>\n    </tr>\n    <tr>\n      <th>2939</th>\n      <td>J</td>\n      <td>Yin, ZY; Jin, YF; Huang, HW; Shen, SL</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Yin Zhen-Yu; Jin Yin-Fu; Huang Hong-Wei; Shen ...</td>\n      
<td>NaN</td>\n      <td>NaN</td>\n      <td>Evolutionary polynomial regression based model...</td>\n      <td>ENGINEERING GEOLOGY</td>\n      <td>...</td>\n      <td>Engineering, Geological; Geosciences, Multidis...</td>\n      <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n      <td>Engineering; Geology</td>\n      <td>DS2IG</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000380592100015</td>\n    </tr>\n    <tr>\n      <th>34651</th>\n      <td>J</td>\n      <td>Wang, JH; Lindenbergh, R; Menenti, M</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Wang, Jinhu; Lindenbergh, Roderik; Menenti, Ma...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>SigVox - A 3D feature matching algorithm for a...</td>\n      <td>ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SEN...</td>\n      <td>...</td>\n      <td>Geography, Physical; Geosciences, Multidiscipl...</td>\n      <td>Science Citation Index Expanded (SCI-EXPANDED)</td>\n      <td>Physical Geography; Geology; Remote Sensing; I...</td>\n      <td>EX2BV</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2023-04-28</td>\n      <td>WOS:000403031400010</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 71 columns</p>\n</div>"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tab-separated WoS export. low_memory=False makes pandas infer each column's dtype\n",
    "# from the whole file instead of chunk by chunk.\n",
    "wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
    "# Seeded preview so that re-running the notebook shows the same rows; the size is\n",
    "# capped because DataFrame.sample raises when asked for more rows than exist.\n",
    "wos.sample(min(100, len(wos)), random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of initial (valid interval) records: 56196\n"
     ]
    },
    {
     "data": {
      "text/plain": "     Domain_English                      Field_English   \n0  Applied Sciences  Agriculture, Fisheries & Forestry  \\\n1  Applied Sciences  Agriculture, Fisheries & Forestry   \n2  Applied Sciences  Agriculture, Fisheries & Forestry   \n3  Applied Sciences  Agriculture, Fisheries & Forestry   \n4  Applied Sciences  Agriculture, Fisheries & Forestry   \n\n         SubField_English 2.00 SEQ                      Source_title  srcid   \n0  Agronomy & Agriculture        1                 Annals of Biology  13016  \\\n1  Agronomy & Agriculture        1              Advances in Agronomy  14324   \n2  Agronomy & Agriculture        1  European Journal of Soil Biology  14648   \n3  Agronomy & Agriculture        1     Soil Biology and Biochemistry  14802   \n4  Agronomy & Agriculture        1               Agricultura Tecnica  14972   \n\n  issn_type      issn  \n0     issn1  09700153  \n1     issn1  00652113  \n2     issn1  11645563  \n3     issn1  00380717  \n4     issn1  03652807  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Domain_English</th>\n      <th>Field_English</th>\n      <th>SubField_English</th>\n      <th>2.00 SEQ</th>\n      <th>Source_title</th>\n      <th>srcid</th>\n      <th>issn_type</th>\n      <th>issn</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Applied Sciences</td>\n      <td>Agriculture, Fisheries &amp; Forestry</td>\n      <td>Agronomy &amp; Agriculture</td>\n      <td>1</td>\n      <td>Annals of Biology</td>\n      <td>13016</td>\n      <td>issn1</td>\n      <td>09700153</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Applied Sciences</td>\n      <td>Agriculture, Fisheries &amp; Forestry</td>\n      <td>Agronomy &amp; Agriculture</td>\n      <td>1</td>\n      <td>Advances in Agronomy</td>\n      <td>14324</td>\n      <td>issn1</td>\n      <td>00652113</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Applied Sciences</td>\n      <td>Agriculture, Fisheries &amp; Forestry</td>\n      <td>Agronomy &amp; Agriculture</td>\n      <td>1</td>\n      <td>European Journal of Soil Biology</td>\n      <td>14648</td>\n      <td>issn1</td>\n      <td>11645563</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Applied Sciences</td>\n      <td>Agriculture, Fisheries &amp; Forestry</td>\n      <td>Agronomy &amp; Agriculture</td>\n      <td>1</td>\n      <td>Soil Biology and Biochemistry</td>\n      <td>14802</td>\n      <td>issn1</td>\n      <td>00380717</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Applied Sciences</td>\n      <td>Agriculture, Fisheries &amp; Forestry</td>\n      <td>Agronomy &amp; Agriculture</td>\n      <td>1</td>\n     
 <td>Agricultura Tecnica</td>\n      <td>14972</td>\n      <td>issn1</td>\n      <td>03652807</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only records whose publication year lies strictly between 2010 and 2023.\n",
    "wos = wos[((wos[\"Publication Year\"]<2023)&(wos[\"Publication Year\"]>2010))].copy()\n",
    "print(f'Number of initial (valid interval) records: {len(wos)}')\n",
    "\n",
    "metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
    "\n",
    "# Wide -> long: one row per (journal, ISSN column). The stacked level is named\n",
    "# explicitly; renaming the positional label 'level_6' only worked while the sheet\n",
    "# had exactly six non-ISSN columns.\n",
    "metrix_id_cols = [c for c in metrix.columns if \"issn\" not in c]\n",
    "metrix = (\n",
    "    metrix.set_index(metrix_id_cols)\n",
    "    .stack()\n",
    "    .rename_axis(metrix_id_cols + [\"issn_type\"])\n",
    "    .rename(\"issn\")\n",
    "    .reset_index()\n",
    ")\n",
    "# Normalise ISSNs (no hyphens, lower case, trimmed) so they can be used as a join key.\n",
    "metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
    "metrix.head()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [
    {
     "data": {
      "text/plain": "Domain_English        6\nField_English        21\nSubField_English    175\ndtype: int64"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct values in each of the three METRIX classification columns.\n",
    "metrix[[\"Domain_English\",\"Field_English\",\"SubField_English\"]].nunique()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of initial (valid interval) records: 56196\n",
      "Number of METRIX filtered records: 49854\n",
      "Number of unindexed records: 2984\n",
      "Number of filtered records (dropping duplicates): 49839\n"
     ]
    }
   ],
   "source": [
    "# Normalise both ISSN variants the same way as the METRIX list (no hyphens,\n",
    "# lower case, trimmed) so that they can be joined on.\n",
    "wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
    "wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
    "\n",
    "# Wide -> long: one row per (record, ISSN variant). The stacked level is named\n",
    "# explicitly; renaming the positional label 'level_71' only worked while the export\n",
    "# had exactly 71 columns.\n",
    "wos_id_cols = [c for c in wos.columns if \"issn\" not in c]\n",
    "wos = (\n",
    "    wos.set_index(wos_id_cols)\n",
    "    .stack()\n",
    "    .rename_axis(wos_id_cols + [\"issn_var\"])\n",
    "    .rename(\"issn\")\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
    "\n",
    "# Split into records with at least one ISSN variant found in METRIX and records\n",
    "# with none (the latter are the entries not indexed by METRIX).\n",
    "wos_indexed = wos_merge[wos_merge[\"Domain_English\"].notna()]\n",
    "wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]\n",
    "\n",
    "# Keep a single row per record; sorting issn_var in descending order places\n",
    "# \"issn\" rows ahead of \"eissn\" rows, so the \"issn\" row is kept when both exist.\n",
    "wos_unindexed = wos_unindexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
    "wos = wos_indexed.sort_values(by=[\"issn_var\"],ascending=False).drop_duplicates(subset=record_col)\n",
    "\n",
    "wos_postmerge = wos.copy()\n",
    "print(f'Number of METRIX filtered records: {len(wos)}')\n",
    "print(f'Number of unindexed records: {len(wos_unindexed)}')\n",
    "\n",
    "# Drop duplicates based on DOI, keeping the first record of each DOI. The previous\n",
    "# duplicated(False) flagged every member of a duplicate group, so such publications\n",
    "# vanished from the data set entirely. Records without a DOI are never treated as\n",
    "# duplicates of each other here.\n",
    "wos = wos[~(wos[\"DOI\"].notna() & wos[\"DOI\"].duplicated(keep=\"first\"))]\n",
    "# Second pass: drop records that agree on all bibliographic key fields.\n",
    "wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
    "print(f'Number of filtered records (dropping duplicates): {len(wos)}')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "Domain_English\nApplied Sciences                31871\nNatural Sciences                 9542\nHealth Sciences                  5942\nEconomic & Social Sciences       1468\narticle-level classification      940\nArts & Humanities                  76\nName: count, dtype: int64"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of records per Domain_English value.\n",
    "wos[\"Domain_English\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "def _most_frequent(values):\n",
    "    \"\"\"Return the first mode of `values`, or NaN when it holds only missing values.\"\"\"\n",
    "    modes = values.mode()\n",
    "    # mode() is empty for an all-missing group; indexing it with [0] used to raise.\n",
    "    return modes.iloc[0] if len(modes) else np.nan\n",
    "\n",
    "# Fallback classifier: for every (WoS Categories, Research Areas) pair, the most\n",
    "# frequent Domain / Field / SubField among the de-duplicated rows of `wos`.\n",
    "# NOTE(review): the mode is taken per column, so the three values of one pair are\n",
    "# not guaranteed to come from the same METRIX row - confirm this is acceptable.\n",
    "wos_classifier = wos[[\"WoS Categories\",\"Research Areas\"]+list(metrix.columns)].copy().drop_duplicates()\n",
    "wos_classifier = wos_classifier.groupby([\"WoS Categories\",\"Research Areas\"], as_index=False)[[\"Domain_English\",\"Field_English\",\"SubField_English\"]].agg(\n",
    "    _most_frequent)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found: 2065 \n",
      "Lost forever: 919\n"
     ]
    }
   ],
   "source": [
    "# Second attempt for records without an ISSN match: remove their METRIX columns and\n",
    "# attach the classification stored in wos_classifier for the same\n",
    "# (WoS Categories, Research Areas) pair.\n",
    "wos_to_reindex = wos_unindexed.drop(columns=list(metrix.columns))\n",
    "wos_found = pd.merge(\n",
    "    wos_to_reindex,\n",
    "    wos_classifier,\n",
    "    how=\"inner\",\n",
    "    on=[\"WoS Categories\", \"Research Areas\"],\n",
    ")\n",
    "# Single-key alternatives, kept disabled:\n",
    "# wos_found = wos_to_reindex.merge(wos_classifier, on=\"Research Areas\", how=\"inner\")\n",
    "# # wos_found = wos_to_reindex.merge(wos_classifier, on=\"WoS Categories\", how=\"inner\")\n",
    "\n",
    "# Unindexed records whose id does not appear among the matched ones.\n",
    "recovered_ids = wos_found[record_col]\n",
    "wos_stillost = wos_unindexed.loc[~wos_unindexed[record_col].isin(recovered_ids)]\n",
    "\n",
    "found_count = recovered_ids.nunique()\n",
    "lost_count = wos_stillost[record_col].nunique()\n",
    "print(\"Found:\", found_count,\"\\nLost forever:\", lost_count)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of records (after remerge): 51904\n"
     ]
    }
   ],
   "source": [
    "# Append the records classified through their WoS categories. De-duplicating on the\n",
    "# record id keeps the cell safe to re-run: without it every re-execution appended\n",
    "# wos_found to wos once more.\n",
    "wos = pd.concat([wos,wos_found], ignore_index=True).drop_duplicates(subset=record_col, ignore_index=True)\n",
    "print(f'Number of records (after remerge): {len(wos)}')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "data": {
      "text/plain": "Domain_English\nApplied Sciences                33720\nNatural Sciences                 9617\nHealth Sciences                  6002\nEconomic & Social Sciences       1533\narticle-level classification      955\nArts & Humanities                  77\nName: count, dtype: int64"
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of records per Domain_English value, recomputed on the current `wos`.\n",
    "wos[\"Domain_English\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "data": {
      "text/plain": "WoS Categories\nEngineering, Electrical & Electronic         13661\nComputer Science, Artificial Intelligence     7760\nComputer Science, Information Systems         6481\nTelecommunications                            5560\nComputer Science, Theory & Methods            3597\n                                             ...  \nMusic                                            1\nCultural Studies                                 1\nPsychology, Psychoanalysis                       1\nAsian Studies                                    1\nAndrology                                        1\nName: count, Length: 236, dtype: int64"
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per (record, WoS category). Splitting and exploding directly replaces the\n",
    "# per-record groupby/apply, whose output layout (including the 'level_1' column that\n",
    "# had to be dropped) depends on how the installed pandas handles group keys.\n",
    "wos_cat = (\n",
    "    wos[[record_col, \"WoS Categories\"]]\n",
    "    .dropna(subset=[record_col])  # groupby ignored records without an id\n",
    "    .sort_values(record_col, kind=\"stable\")  # groupby emitted groups in key order\n",
    "    .assign(**{\"WoS Categories\": lambda df: df[\"WoS Categories\"].str.split(\";\")})\n",
    "    .explode(\"WoS Categories\", ignore_index=True)\n",
    ")\n",
    "wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n",
    "wos_cat[\"WoS Categories\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "data": {
      "text/plain": "WoS Category\nEngineering                                  20126\nComputer Science                             17613\nTelecommunications                            5560\nImaging Science & Photographic Technology     3295\nAutomation & Control Systems                  3232\n                                             ...  \nMusic                                            1\nAndrology                                        1\nLiterature                                       1\nCultural Studies                                 1\nAsian Studies                                    1\nName: count, Length: 177, dtype: int64"
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split every 'Category, SubCategory' label at its first comma; a label without a\n",
    "# comma gets a missing SubCategory.\n",
    "wos_subcat = wos_cat.copy()\n",
    "label_parts = wos_subcat[\"WoS Categories\"].str.split(\",\", n=1, expand=True)\n",
    "wos_subcat[\"WoS Category\"] = label_parts[0].str.strip()\n",
    "wos_subcat[\"WoS SubCategory\"] = label_parts[1].str.strip()\n",
    "wos_subcat[\"WoS Categories\"] = wos_subcat[\"WoS Categories\"].str.strip()\n",
    "# Count each top-level category at most once per record.\n",
    "(\n",
    "    wos_subcat\n",
    "    .drop_duplicates(subset=[record_col, \"WoS Category\"])\n",
    "    [\"WoS Category\"]\n",
    "    .value_counts()\n",
    ")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [
    {
     "data": {
      "text/plain": "Research Areas\nEngineering                                  20176\nComputer Science                             17613\nTelecommunications                            5560\nEnvironmental Sciences & Ecology              3732\nImaging Science & Photographic Technology     3295\n                                             ...  \nLiterature                                       1\nWomen's Studies                                  1\nCultural Studies                                 1\nAsian Studies                                    1\nMusic                                            1\nName: count, Length: 147, dtype: int64"
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
    "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
    "wos_areas[\"Research Areas\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [
    {
     "data": {
      "text/plain": "                                           Article Title   \n24862  Kinematic self-calibration of non-contact five...  \\\n6623   Optimizing Color Assignment for Perception of ...   \n20728  CFD modeling of biomass combustion and gasific...   \n41245         Redshift-space distortions in f(R) gravity   \n12373  Executable Knowledge Graphs for Machine Learni...   \n...                                                  ...   \n11117  Biochar amendment mitigated N2O emissions from...   \n47975  Adaptive Noise Reduction for Sound Event Detec...   \n4599   NVM Storage in IoT Devices: Opportunities and ...   \n40609  FABNet: Fusion Attention Block and Transfer Le...   \n45199  Tea Category Identification Using a Novel Frac...   \n\n                                           Keywords Plus   \n24862            POSE MEASUREMENT; PARALLEL; MANIPULATOR  \\\n6623                            OPTIMIZATION; DIFFERENCE   \n20728  DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...   \n41245  DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...   \n12373                                                NaN   \n...                                                  ...   \n11117  NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...   \n47975  NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...   \n4599   ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...   \n40609                                             NUCLEI   \n45199  LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI...   \n\n                                         Author Keywords  \n24862  kinematic self-calibration; five-axis measurin...  \n6623       Color perception; visual design; scatterplots  \n20728  Biomass combustion and gasification; CFD simul...  \n41245  cosmology: theory; dark energy; large-scale st...  \n12373  Knowledge graph; Machine learning; Data analyt...  \n...                                                  ...  \n11117  Biochar; Nitrite accumulation; Nitrous oxide; ...  
\n47975  sound event detection; non-stationary noise; w...  \n4599   IoT; NVM; storage system; energy efficiency; s...  \n40609  Cancer; Analytical models; Transfer learning; ...  \n45199  tea-category identification; fractional Fourie...  \n\n[100 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Article Title</th>\n      <th>Keywords Plus</th>\n      <th>Author Keywords</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>24862</th>\n      <td>Kinematic self-calibration of non-contact five...</td>\n      <td>POSE MEASUREMENT; PARALLEL; MANIPULATOR</td>\n      <td>kinematic self-calibration; five-axis measurin...</td>\n    </tr>\n    <tr>\n      <th>6623</th>\n      <td>Optimizing Color Assignment for Perception of ...</td>\n      <td>OPTIMIZATION; DIFFERENCE</td>\n      <td>Color perception; visual design; scatterplots</td>\n    </tr>\n    <tr>\n      <th>20728</th>\n      <td>CFD modeling of biomass combustion and gasific...</td>\n      <td>DISCRETE PARTICLE SIMULATION; STEAM GASIFICATI...</td>\n      <td>Biomass combustion and gasification; CFD simul...</td>\n    </tr>\n    <tr>\n      <th>41245</th>\n      <td>Redshift-space distortions in f(R) gravity</td>\n      <td>DARK ENERGY SURVEY; REAL-SPACE; GROWTH; DENSIT...</td>\n      <td>cosmology: theory; dark energy; large-scale st...</td>\n    </tr>\n    <tr>\n      <th>12373</th>\n      <td>Executable Knowledge Graphs for Machine Learni...</td>\n      <td>NaN</td>\n      <td>Knowledge graph; Machine learning; Data analyt...</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>11117</th>\n      <td>Biochar amendment mitigated N2O emissions from...</td>\n      <td>NITROUS-OXIDE EMISSIONS; NITRIFIER DENITRIFICA...</td>\n      <td>Biochar; Nitrite accumulation; Nitrous oxide; ...</td>\n    </tr>\n    <tr>\n      <th>47975</th>\n      
<td>Adaptive Noise Reduction for Sound Event Detec...</td>\n      <td>NONNEGATIVE MATRIX FACTORIZATION; SOURCE SEPAR...</td>\n      <td>sound event detection; non-stationary noise; w...</td>\n    </tr>\n    <tr>\n      <th>4599</th>\n      <td>NVM Storage in IoT Devices: Opportunities and ...</td>\n      <td>ENCRYPTED DATA; ACCESS-CONTROL; INTERNET; MEMO...</td>\n      <td>IoT; NVM; storage system; energy efficiency; s...</td>\n    </tr>\n    <tr>\n      <th>40609</th>\n      <td>FABNet: Fusion Attention Block and Transfer Le...</td>\n      <td>NUCLEI</td>\n      <td>Cancer; Analytical models; Transfer learning; ...</td>\n    </tr>\n    <tr>\n      <th>45199</th>\n      <td>Tea Category Identification Using a Novel Frac...</td>\n      <td>LEARNING-BASED OPTIMIZATION; PATHOLOGICAL BRAI...</td>\n      <td>tea-category identification; fractional Fourie...</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "data": {
      "text/plain": "      UT (Unique WOS ID)                   keyword_all\n0    WOS:000208837000001               NANOINDENTATION\n1    WOS:000208837000001                      HARDNESS\n2    WOS:000208837000001        PLASMA-SPRAYED COATING\n3    WOS:000208837000001              INVERSE ANALYSIS\n4    WOS:000208837000001              NUMERICAL METHOD\n..                   ...                           ...\n97   WOS:000209571700012         PERSONALIZED MEDICINE\n98   WOS:000209571700012               COMPLEX NETWORK\n99   WOS:000209571700012    CLINICAL PHENOTYPE NETWORK\n100  WOS:000209571700012  TRADITIONAL CHINESE MEDICINE\n101  WOS:000209617200002                PHYLLOSCOPIDAE\n\n[100 rows x 2 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>keyword_all</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>NANOINDENTATION</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208837000001</td>\n      <td>HARDNESS</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208837000001</td>\n      <td>PLASMA-SPRAYED COATING</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>INVERSE ANALYSIS</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208837000001</td>\n      <td>NUMERICAL METHOD</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>97</th>\n      <td>WOS:000209571700012</td>\n      <td>PERSONALIZED MEDICINE</td>\n    </tr>\n    <tr>\n      <th>98</th>\n      <td>WOS:000209571700012</td>\n      <td>COMPLEX NETWORK</td>\n    </tr>\n    <tr>\n      <th>99</th>\n      <td>WOS:000209571700012</td>\n      <td>CLINICAL PHENOTYPE NETWORK</td>\n    </tr>\n    <tr>\n      <th>100</th>\n      <td>WOS:000209571700012</td>\n      <td>TRADITIONAL CHINESE MEDICINE</td>\n    </tr>\n    <tr>\n      <th>101</th>\n      <td>WOS:000209617200002</td>\n      <td>PHYLLOSCOPIDAE</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kw_df = pd.DataFrame()\n",
    "for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
    "    kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
    "    kwp.name = 'keyword_all'\n",
    "    kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n",
    "kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n",
    "kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n",
    "kw_df.head(100)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [
    {
     "data": {
      "text/plain": "    UT (Unique WOS ID)                                        keyword_all\n0  WOS:000208837000001  NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...\n1  WOS:000208863600013  COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...\n2  WOS:000208863600266  ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n3  WOS:000208863900217  DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...\n4  WOS:000208935500007  ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>keyword_all</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>NANOINDENTATION; HARDNESS; PLASMA-SPRAYED COAT...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208863600013</td>\n      <td>COMPARATIVE GENOMICS; ANAMMOX; KUENENIA STUTTG...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208863600266</td>\n      <td>ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208863900217</td>\n      <td>DEFAULT MODE NETWORK; EFFECTIVE CONNECTIVITY; ...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208935500007</td>\n      <td>ESTIMATION; SOURCE TERM; QUASI STEADY; THE ITE...</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n",
    "wos_kwd_concat.head()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [
    {
     "data": {
      "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n       'Conference Date', 'Conference Location', 'Conference Sponsor',\n       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n       'Funding Text', 'Cited References', 'Cited Reference Count',\n       'Times Cited, WoS Core', 'Times Cited, All Databases',\n       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n       'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n       'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n       'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n       'Number of Pages', 'WoS Categories', 'Web of Science Index',\n       'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n       'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n       'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n       'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n       'srcid', 'issn_type'],\n      dtype='object')"
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos.columns"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "geotext = GeoText()\n",
    "\n",
    "def extract_location(input_text, key='countries'):\n",
    "    anomalies = {\"Malta\":\"Malta\",\n",
    "                 \"Mongolia\":\"Mongolia\",\n",
    "                 \"Quatar\":\"Qatar\",\n",
    "                 \"Qatar\":\"Qatar\",\n",
    "                 \"Ethiop\":\"Ethiopia\",\n",
    "                 \"Nigeria\":\"Nigeria\",\n",
    "                 \"BELAR\":\"Belarus\",\n",
    "                 \"Venezuela\":\"Venezuela\",\n",
    "                 \"Cyprus\":\"Cyprus\",\n",
    "                 \"Ecuador\":\"Ecuador\",\n",
    "                 \"U Arab\":\"United Arab Emirates\",\n",
    "                 \"Syria\":\"Syria\",\n",
    "                 \"Uganda\":\"Uganda\",\n",
    "                 \"Yemen\":\"Yemen\",\n",
    "                 \"Mali\":\"Mali\",\n",
    "                 \"Senegal\":\"Senegal\",\n",
    "                 \"Vatican\":\"Vatican\",\n",
    "                 \"Uruguay\":\"Uruguay\",\n",
    "                 \"Panama\":\"Panama\",\n",
    "                 \"Fiji\":\"Fiji\",\n",
    "                 \"Faroe\":\"Faroe Islands\",\n",
    "                 \"Macedonia\":\"Macedonia\",\n",
    "                 'Mozambique':'Mozambique',\n",
    "                 \"Kuwait\":\"Kuwait\",\n",
    "                 \"Libya\":\"Libya\",\n",
    "                 \"Turkiy\":\"Turkey\",\n",
    "                 \"Liberia\":\"Liberia\",\n",
    "                 \"Namibia\":\"Namibia\",\n",
    "                 \"Ivoire\":\"Ivory Coast\",\n",
    "                 \"Guatemala\":\"Gutemala\",\n",
    "                 \"Paraguay\":\"Paraguay\",\n",
    "                 \"Honduras\":\"Honduras\",\n",
    "                 \"Nicaragua\":\"Nicaragua\",\n",
    "                 \"Trinidad\":\"Trinidad & Tobago\",\n",
    "                 \"Liechtenstein\":\"Liechtenstein\",\n",
    "                 \"Greenland\":\"Denmark\"}\n",
    "\n",
    "    extracted = geotext.extract(input_text=input_text)\n",
    "    found = extracted[key].keys()\n",
    "    if len(sorted(found))>0:\n",
    "        return sorted(found)[0]\n",
    "    elif key=='countries':\n",
    "        for i  in ['Scotland','Wales','England', 'N Ireland']:\n",
    "            if i in input_text:\n",
    "                return 'United Kingdom'\n",
    "        for j in anomalies.keys():\n",
    "            if j in input_text:\n",
    "                return anomalies.get(j)\n",
    "    else:\n",
    "        return None\n",
    "\n",
    "with open('../eu_members.txt',\"r\") as f:\n",
    "    eu_countries=f.readline().split(\",\")\n",
    "    eu_countries=[i.strip() for i in eu_countries]\n",
    "\n",
    "def country_cleanup(country):\n",
    "    if \"USA\" in country:\n",
    "        return \"USA\"\n",
    "    elif \"China\" in country:\n",
    "        return \"China\"\n",
    "    elif country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n",
    "        return \"United Kingdom\"\n",
    "    else:\n",
    "        return country\n",
    "\n",
    "\n",
    "def country_type(country):\n",
    "    if country in eu_countries:\n",
    "        return \"EU\"\n",
    "    elif country==\"China\":\n",
    "        return \"China\"\n",
    "    elif country in [\"Switzerland\", 'Norway','United Kingdom']:\n",
    "        return \"Non-EU associate\"\n",
    "    else:\n",
    "        return \"Other\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
    "\n",
    "\n",
    "locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
    "locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
    "locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "data": {
      "text/plain": "312820"
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(locations)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [
    {
     "data": {
      "text/plain": "     UT (Unique WOS ID)                                 Authors_of_address   \n0   WOS:000208837000001                                Gitzhofer, Francois  \\\n1   WOS:000208837000001  Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...   \n2   WOS:000208837000001                     Guo, Wei-Chao; Zhang, Wei-Hong   \n3   WOS:000208837000001                                       Rauchs, Gast   \n4   WOS:000208863600013                                         Hu, Baolan   \n..                  ...                                                ...   \n95  WOS:000209546000001                                  Salahuddin, Nawal   \n96  WOS:000209546000001                                Shrestha, Babu Raja   \n97  WOS:000209546000001                                   Tan, Cheng Cheng   \n98  WOS:000209546000001                                     Tang, Yao-Qing   \n99  WOS:000209546000001                                       Tu, Mei-Lien   \n\n                                              Address  \n0   Univ Sherbrooke, Dept Chem Engn, Plasma Techno...  \n1   Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...  \n2   Northwestern Polytech Univ, Key Lab Contempora...  \n3   Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru...  \n4   Zhejiang Univ, Dept Environm Engn, Hangzhou 31...  \n..                                                ...  \n95  Aga Khan Univ & Hosp, Dept Med, Pulm & Crit Ca...  \n96  Kathmandu Med Coll Teaching Hosp, Dept Anesthe...  \n97  Sultanah Aminah Hosp, Dept Anaesthesia & Inten...  \n98  Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,...  \n99  Chang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R...  \n\n[100 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Authors_of_address</th>\n      <th>Address</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>Gitzhofer, Francois</td>\n      <td>Univ Sherbrooke, Dept Chem Engn, Plasma Techno...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208837000001</td>\n      <td>Guo, Wei-Chao; Papeleux, Luc; Ponthot, Jean-Ph...</td>\n      <td>Univ Liege, Aerosp &amp; Mech Engn Dept, LTAS MN2L...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208837000001</td>\n      <td>Guo, Wei-Chao; Zhang, Wei-Hong</td>\n      <td>Northwestern Polytech Univ, Key Lab Contempora...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>Rauchs, Gast</td>\n      <td>Ctr Rech Publ Henri Tudor, Dept Adv Mat &amp; Stru...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208863600013</td>\n      <td>Hu, Baolan</td>\n      <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>95</th>\n      <td>WOS:000209546000001</td>\n      <td>Salahuddin, Nawal</td>\n      <td>Aga Khan Univ &amp; Hosp, Dept Med, Pulm &amp; Crit Ca...</td>\n    </tr>\n    <tr>\n      <th>96</th>\n      <td>WOS:000209546000001</td>\n      <td>Shrestha, Babu Raja</td>\n      <td>Kathmandu Med Coll Teaching Hosp, Dept Anesthe...</td>\n    </tr>\n    <tr>\n      <th>97</th>\n      <td>WOS:000209546000001</td>\n      <td>Tan, Cheng Cheng</td>\n      
<td>Sultanah Aminah Hosp, Dept Anaesthesia &amp; Inten...</td>\n    </tr>\n    <tr>\n      <th>98</th>\n      <td>WOS:000209546000001</td>\n      <td>Tang, Yao-Qing</td>\n      <td>Shanghai Jiao Tong Univ, Sch Med, Ruijin Hosp,...</td>\n    </tr>\n    <tr>\n      <th>99</th>\n      <td>WOS:000209546000001</td>\n      <td>Tu, Mei-Lien</td>\n      <td>Chang Gung Mem Hosp, Kaohsiung Med Ctr, Dept R...</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "locations[\"Address\"] = locations[\"Address\"].str.strip().str.strip(\";\")\n",
    "locations = locations.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_2\")\n",
    "locations.head(100)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [],
   "source": [
    "# import dask.dataframe as dd\n",
    "#\n",
    "# locations_ddf = dd.from_pandas(locations, npartitions=4)  # convert pandas DataFrame to Dask DataFrame\n",
    "# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute()  # compute the result"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [],
   "source": [
    "# locations_test = locations.head(1000)\n",
    "# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n",
    "# locations_test"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [],
   "source": [
    "\n",
    "# locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
    "locations[\"Country\"]=locations['Address'].apply(lambda x: x.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \"))\n",
    "locations[\"Country\"]=locations['Country'].apply(lambda x: country_cleanup(x))\n",
    "locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n",
    "locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [],
   "source": [
    "scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n",
    "locations=locations[locations[\"Country_Type\"].isin(scope_types)]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "    UT (Unique WOS ID)                                            Address   \n1  WOS:000208837000001  Univ Liege, Aerosp & Mech Engn Dept, LTAS MN2L...  \\\n2  WOS:000208837000001  Northwestern Polytech Univ, Key Lab Contempora...   \n3  WOS:000208837000001  Ctr Rech Publ Henri Tudor, Dept Adv Mat & Stru...   \n4  WOS:000208863600013  Zhejiang Univ, Dept Environm Engn, Hangzhou 31...   \n5  WOS:000208863600013  Delft Univ Technol, Dept Biotechnol, Delft, Ne...   \n\n       Country        City Country_Type                 Institution  \n1      Belgium       Liège           EU                  Univ Liege  \n2        China       Xi’an        China  Northwestern Polytech Univ  \n3   Luxembourg  Luxembourg           EU   Ctr Rech Publ Henri Tudor  \n4        China    Hangzhou        China               Zhejiang Univ  \n5  Netherlands       Delft           EU          Delft Univ Technol  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Address</th>\n      <th>Country</th>\n      <th>City</th>\n      <th>Country_Type</th>\n      <th>Institution</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208837000001</td>\n      <td>Univ Liege, Aerosp &amp; Mech Engn Dept, LTAS MN2L...</td>\n      <td>Belgium</td>\n      <td>Liège</td>\n      <td>EU</td>\n      <td>Univ Liege</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208837000001</td>\n      <td>Northwestern Polytech Univ, Key Lab Contempora...</td>\n      <td>China</td>\n      <td>Xi’an</td>\n      <td>China</td>\n      <td>Northwestern Polytech Univ</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>Ctr Rech Publ Henri Tudor, Dept Adv Mat &amp; Stru...</td>\n      <td>Luxembourg</td>\n      <td>Luxembourg</td>\n      <td>EU</td>\n      <td>Ctr Rech Publ Henri Tudor</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208863600013</td>\n      <td>Zhejiang Univ, Dept Environm Engn, Hangzhou 31...</td>\n      <td>China</td>\n      <td>Hangzhou</td>\n      <td>China</td>\n      <td>Zhejiang Univ</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>WOS:000208863600013</td>\n      <td>Delft Univ Technol, Dept Biotechnol, Delft, Ne...</td>\n      <td>Netherlands</td>\n      <td>Delft</td>\n      <td>EU</td>\n      <td>Delft Univ Technol</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n",
    "univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n",
    "univ_locations = univ_locations.drop_duplicates()\n",
    "univ_locations.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "    UT (Unique WOS ID)  Country Country_Type                     author_str_id\n0  WOS:000208837000001  Belgium           EU  6079964a4094c607358a130e41e89f90\n1  WOS:000208837000001  Belgium           EU  2321037fa90ac94a23b88a79f1c7f454\n2  WOS:000208837000001  Belgium           EU  8a1bfa1e7bc52d323f0d9c23a9b74ed3\n3  WOS:000208837000001    China        China  6079964a4094c607358a130e41e89f90\n4  WOS:000208837000001    China        China  17fb036de6a4db3ba39ccab3d8307c04",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Country</th>\n      <th>Country_Type</th>\n      <th>author_str_id</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>Belgium</td>\n      <td>EU</td>\n      <td>6079964a4094c607358a130e41e89f90</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>WOS:000208837000001</td>\n      <td>Belgium</td>\n      <td>EU</td>\n      <td>2321037fa90ac94a23b88a79f1c7f454</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>WOS:000208837000001</td>\n      <td>Belgium</td>\n      <td>EU</td>\n      <td>8a1bfa1e7bc52d323f0d9c23a9b74ed3</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>6079964a4094c607358a130e41e89f90</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208837000001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>17fb036de6a4db3ba39ccab3d8307c04</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n",
    "author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n",
    "author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
    "author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n",
    "author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n",
    "author_locations = author_locations.drop(columns=\"Author_name\")\n",
    "author_locations.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [
    {
     "data": {
      "text/plain": "         UT (Unique WOS ID)      Country      Country_Type   \n0       WOS:000208837000001      Belgium                EU  \\\n3       WOS:000208837000001        China             China   \n4       WOS:000208837000001        China             China   \n6       WOS:000208863600013        China             China   \n7       WOS:000208863600013  Netherlands                EU   \n...                     ...          ...               ...   \n643323  WOS:000964683900016        Italy                EU   \n643324  WOS:000964683900016        Italy                EU   \n643325  WOS:000967389100001        China             China   \n643326  WOS:000967389100001       Norway  Non-EU associate   \n643327  WOS:000967389100001       Norway  Non-EU associate   \n\n                           author_str_id  \n0       6079964a4094c607358a130e41e89f90  \n3       6079964a4094c607358a130e41e89f90  \n4       17fb036de6a4db3ba39ccab3d8307c04  \n6       54c7bc6fe9b77434ca1bf04d763d843b  \n7       df81f9da6c8f5c968c16ef0aab1bb8f9  \n...                                  ...  \n643323  3c631398a81ab7058d95a0c6418a2c0b  \n643324  3c631398a81ab7058d95a0c6418a2c0b  \n643325  ce65541a6c334225a9617439f4a95012  \n643326  7c52a53f8d79b1ffd4f2e4cde9548e1d  \n643327  7c52a53f8d79b1ffd4f2e4cde9548e1d  \n\n[573569 rows x 4 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Country</th>\n      <th>Country_Type</th>\n      <th>author_str_id</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>WOS:000208837000001</td>\n      <td>Belgium</td>\n      <td>EU</td>\n      <td>6079964a4094c607358a130e41e89f90</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>WOS:000208837000001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>6079964a4094c607358a130e41e89f90</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>WOS:000208837000001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>17fb036de6a4db3ba39ccab3d8307c04</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>WOS:000208863600013</td>\n      <td>China</td>\n      <td>China</td>\n      <td>54c7bc6fe9b77434ca1bf04d763d843b</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>WOS:000208863600013</td>\n      <td>Netherlands</td>\n      <td>EU</td>\n      <td>df81f9da6c8f5c968c16ef0aab1bb8f9</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>643323</th>\n      <td>WOS:000964683900016</td>\n      <td>Italy</td>\n      <td>EU</td>\n      <td>3c631398a81ab7058d95a0c6418a2c0b</td>\n    </tr>\n    <tr>\n      <th>643324</th>\n      <td>WOS:000964683900016</td>\n      <td>Italy</td>\n      <td>EU</td>\n      <td>3c631398a81ab7058d95a0c6418a2c0b</td>\n    </tr>\n    <tr>\n      <th>643325</th>\n      <td>WOS:000967389100001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>ce65541a6c334225a9617439f4a95012</td>\n    
</tr>\n    <tr>\n      <th>643326</th>\n      <td>WOS:000967389100001</td>\n      <td>Norway</td>\n      <td>Non-EU associate</td>\n      <td>7c52a53f8d79b1ffd4f2e4cde9548e1d</td>\n    </tr>\n    <tr>\n      <th>643327</th>\n      <td>WOS:000967389100001</td>\n      <td>Norway</td>\n      <td>Non-EU associate</td>\n      <td>7c52a53f8d79b1ffd4f2e4cde9548e1d</td>\n    </tr>\n  </tbody>\n</table>\n<p>573569 rows × 4 columns</p>\n</div>"
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "author_locations[author_locations['author_str_id'].duplicated(False)]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n",
    "# author_primary_region\n",
    "\n",
    "china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n",
    "eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n",
    "assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n",
    "\n",
    "\n",
    "# records that have distinct authors with different country affiliations\n",
    "valid_scope = wos[((wos[record_col].isin(china))\n",
    "         &\n",
    "         ((wos[record_col].isin(eu))\n",
    "         |\n",
    "         (wos[record_col].isin(assoc))))][record_col].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "outputs": [
    {
     "data": {
      "text/plain": "         UT (Unique WOS ID) Country Country_Type   \n537692  WOS:000732204600001   China        China  \\\n204027  WOS:000414089800001   China        China   \n204028  WOS:000414089800001   China        China   \n204029  WOS:000414089800001   China        China   \n204030  WOS:000414090800001   China        China   \n\n                           author_str_id  \n537692  8fe31cbbd07c639aa4d779688896be81  \n204027  67c7beb18fafd77f1319739fa683bc5e  \n204028  7269f0a31fc620688aae12aad9e3cd85  \n204029  ac28aea698a527fb5195d3d24189ea04  \n204030  6c91bf481b6bddc1426d12a18823224a  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>UT (Unique WOS ID)</th>\n      <th>Country</th>\n      <th>Country_Type</th>\n      <th>author_str_id</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>537692</th>\n      <td>WOS:000732204600001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>8fe31cbbd07c639aa4d779688896be81</td>\n    </tr>\n    <tr>\n      <th>204027</th>\n      <td>WOS:000414089800001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>67c7beb18fafd77f1319739fa683bc5e</td>\n    </tr>\n    <tr>\n      <th>204028</th>\n      <td>WOS:000414089800001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>7269f0a31fc620688aae12aad9e3cd85</td>\n    </tr>\n    <tr>\n      <th>204029</th>\n      <td>WOS:000414089800001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>ac28aea698a527fb5195d3d24189ea04</td>\n    </tr>\n    <tr>\n      <th>204030</th>\n      <td>WOS:000414090800001</td>\n      <td>China</td>\n      <td>China</td>\n      <td>6c91bf481b6bddc1426d12a18823224a</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "author_primary_region.head()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of records: 51904\n",
      "Number of valid cooperation records: 46060\n"
     ]
    }
   ],
   "source": [
    "print(f'Number of records: {len(wos)}')\n",
    "print(f'Number of valid cooperation records: {len(valid_scope)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "outputs": [],
   "source": [
    "wos = wos[wos[record_col].isin(valid_scope)]\n",
    "locations = locations[locations[record_col].isin(valid_scope)]\n",
    "univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
    "author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
    "author_primary_region = author_locations[author_locations[record_col].isin(valid_scope)]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
    "affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n",
    "affiliations = affiliations.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "outputs": [
    {
     "data": {
      "text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES                                                       5616\nUNIVERSITY OF LONDON                                                              2604\nUDICE-FRENCH RESEARCH UNIVERSITIES                                                2240\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS)                               2170\nTSINGHUA UNIVERSITY                                                               1935\n                                                                                  ... \nUNIVERSITY OF FUKUI                                                                  1\nPONTIFICIA UNIVERSIDADE CATOLICA DE GOIAS                                            1\nINSTITUTE OF ORGANIC CHEMISTRY & BIOCHEMISTRY OF THE CZECH ACADEMY OF SCIENCES       1\nUNIVERSITAS PELITA HARAPAN                                                           1\nFRANCISCUS GASTHUIS                                                                  1\nName: count, Length: 7609, dtype: int64"
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "affiliations[\"Affiliations\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "outputs": [
    {
     "data": {
      "text/plain": "Institution\nChinese Acad Sci                                 5749\nTsinghua Univ                                    2315\nShanghai Jiao Tong Univ                          1976\nZhejiang Univ                                    1806\nPeking Univ                                      1661\n                                                 ... \nNatl Technol Inst Mental Disorders                  1\nSeinajoki Univ Appl Sci                             1\nJD Intelligent City Res                             1\nCAS Ctr Excellence Planetol                         1\nKey Lab Intelligent Prevent Med Zhejiang Prov       1\nName: count, Length: 19821, dtype: int64"
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "univ_locations[\"Institution\"].value_counts()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "outputs": [
    {
     "data": {
      "text/plain": "46060"
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "univ_locations[record_col].nunique()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "outputs": [
    {
     "data": {
      "text/plain": "46060"
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "affiliations[record_col].nunique()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "outputs": [
    {
     "data": {
      "text/plain": "202790"
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "univ_locations[\"Institution\"].value_counts().sum()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "outputs": [
    {
     "data": {
      "text/plain": "268471"
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "affiliations[\"Affiliations\"].value_counts().sum()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "WoS Categories\n Engineering, Electrical & Electronic        8303\nComputer Science, Artificial Intelligence    6115\n Telecommunications                          4661\nComputer Science, Information Systems        4584\nEngineering, Electrical & Electronic         4036\n                                             ... \nCultural Studies                                1\n Ornithology                                    1\n Criminology & Penology                         1\nArt                                             1\n Psychology, Developmental                      1\nName: count, Length: 425, dtype: int64"
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
    "wos_cat[\"WoS Categories\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "Research Areas\nEngineering                                  18098\nComputer Science                             15658\nTelecommunications                            5046\nEnvironmental Sciences & Ecology              3246\nImaging Science & Photographic Technology     2947\n                                             ...  \nFilm, Radio & Television                         2\nArea Studies                                     2\nCultural Studies                                 1\nAsian Studies                                    1\nMusic                                            1\nName: count, Length: 145, dtype: int64"
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
    "wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
    "wos_areas[\"Research Areas\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[c for c in wos.columns if \"_English\" in c]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
    "for m in metrix_levels:\n",
    "    wos[m] = wos[m].replace({\"article-level classification\":\"Multidisciplinary\"})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "      Publication Type                                            Authors   \n0                    J                       Yan, Z; Jing, XY; Pedrycz, W  \\\n1                    J  Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...   \n2                    J  Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...   \n3                    J  Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...   \n4                    J  Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu...   \n...                ...                                                ...   \n51897                J  Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ...   \n51898                J                     Wang, HC; Roussel, P; Denby, B   \n51899                J       Zhang, R; Alpdogan, S; Kong, SQ; Muhammad, S   \n51902                J                                   Chu, WP; Song, Y   \n51903                J  Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;...   \n\n      Book Authors Book Editors Book Group Authors   \n0              NaN          NaN                NaN  \\\n1              NaN          NaN                NaN   \n2              NaN          NaN                NaN   \n3              NaN          NaN                NaN   \n4              NaN          NaN                NaN   \n...            ...          ...                ...   \n51897          NaN          NaN                NaN   \n51898          NaN          NaN                NaN   \n51899          NaN          NaN                NaN   \n51902          NaN          NaN                NaN   \n51903          NaN          NaN                NaN   \n\n                                       Author Full Names   \n0              Yan, Zheng; Jing, Xuyang; Pedrycz, Witold  \\\n1      Sookhak, Mehdi; Yu, F. Richard; He, Ying; Tale...   \n2      Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...   \n3      Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...   \n4      Lu, Tianguang; Chen, Xinyu; McElroy, Michael B...   \n...       
                                           ...   \n51897  Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir...   \n51898       Wang, Hongcui; Roussel, Pierre; Denby, Bruce   \n51899  Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh...   \n51902                           Chu, Wenping; Song, Yang   \n51903  Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa...   \n\n      Book Author Full Names Group Authors   \n0                        NaN           NaN  \\\n1                        NaN           NaN   \n2                        NaN           NaN   \n3                        NaN           NaN   \n4                        NaN           NaN   \n...                      ...           ...   \n51897                    NaN           NaN   \n51898                    NaN           NaN   \n51899                    NaN           NaN   \n51902                    NaN           NaN   \n51903                    NaN           NaN   \n\n                                           Article Title   \n0      LEFusing and mining opinions for reputation ge...  \\\n1      FOG VEHICULAR COMPUTING Augmentation of Fog Co...   \n2      Deep Reinforcement Learning for Intelligent In...   \n3      An Intelligent UAV based Data Aggregation Algo...   \n4      A Reinforcement Learning-Based Decision System...   \n...                                                  ...   \n51897  Neural modal ordinary differential equations: ...   \n51898  Improving ultrasound-based multimodal speech r...   \n51899  Application of computer-aided image reconstruc...   \n51902  Study on Dynamic Interaction of Railway Pantog...   \n51903   A Review of Technical Standards for Smart Cities   \n\n                                            Source Title  ...   \n0                                     INFORMATION FUSION  ...  \\\n1                     IEEE VEHICULAR TECHNOLOGY MAGAZINE  ...   \n2      IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ...  ...   \n3                                      COMPUTER NETWORKS  ...   
\n4                        IEEE TRANSACTIONS ON SMART GRID  ...   \n...                                                  ...  ...   \n51897                           DATA-CENTRIC ENGINEERING  ...   \n51898                               JASA EXPRESS LETTERS  ...   \n51899                   EGYPTIAN JOURNAL OF NEUROSURGERY  ...   \n51902                                          VIBRATION  ...   \n51903                                 CLEAN TECHNOLOGIES  ...   \n\n        UT (Unique WOS ID)  issn_var      issn    Domain_English   \n0      WOS:000394070100013      issn  15662535  Applied Sciences  \\\n1      WOS:000408568800008      issn  15566072  Applied Sciences   \n2      WOS:000502789700018      issn  23327731  Applied Sciences   \n3      WOS:000626758800004      issn  13891286  Applied Sciences   \n4      WOS:000641976000028      issn  19493053  Applied Sciences   \n...                    ...       ...       ...               ...   \n51897  WOS:000906995300001     eissn       NaN  Applied Sciences   \n51898  WOS:000642230800005     eissn       NaN  Natural Sciences   \n51899  WOS:000807222600001     eissn       NaN   Health Sciences   \n51902  WOS:000661660800001     eissn       NaN  Applied Sciences   \n51903  WOS:000708219500008     eissn       NaN  Natural Sciences   \n\n                                  Field_English   \n0      Information & Communication Technologies  \\\n1      Information & Communication Technologies   \n2      Information & Communication Technologies   \n3      Information & Communication Technologies   \n4             Enabling & Strategic Technologies   \n...                                         ...   
\n51897  Information & Communication Technologies   \n51898                       Physics & Astronomy   \n51899                         Clinical Medicine   \n51902                               Engineering   \n51903            Earth & Environmental Sciences   \n\n                                 SubField_English 2.00 SEQ   \n0      Artificial Intelligence & Image Processing       31  \\\n1                 Networking & Telecommunications       37   \n2                 Networking & Telecommunications       37   \n3                 Networking & Telecommunications       37   \n4                                          Energy       14   \n...                                           ...      ...   \n51897  Artificial Intelligence & Image Processing      NaN   \n51898                                   Acoustics      NaN   \n51899                    Neurology & Neurosurgery      NaN   \n51902         Mechanical Engineering & Transports      NaN   \n51903                      Environmental Sciences      NaN   \n\n                                            Source_title         srcid   \n0                                     Information Fusion  2.609900e+04  \\\n1                     IEEE Vehicular Technology Magazine  5.200153e+09   \n2      IEEE Transactions on Cognitive Communications ...  2.110085e+10   \n3                                      Computer Networks  2.681100e+04   \n4                        IEEE Transactions on Smart Grid  1.970017e+10   \n...                                                  ...           ...   
\n51897                                                NaN           NaN   \n51898                                                NaN           NaN   \n51899                                                NaN           NaN   \n51902                                                NaN           NaN   \n51903                                                NaN           NaN   \n\n      issn_type  \n0         issn1  \n1         issn1  \n2         issn1  \n3         issn1  \n4         issn2  \n...         ...  \n51897       NaN  \n51898       NaN  \n51899       NaN  \n51902       NaN  \n51903       NaN  \n\n[46060 rows x 80 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Publication Type</th>\n      <th>Authors</th>\n      <th>Book Authors</th>\n      <th>Book Editors</th>\n      <th>Book Group Authors</th>\n      <th>Author Full Names</th>\n      <th>Book Author Full Names</th>\n      <th>Group Authors</th>\n      <th>Article Title</th>\n      <th>Source Title</th>\n      <th>...</th>\n      <th>UT (Unique WOS ID)</th>\n      <th>issn_var</th>\n      <th>issn</th>\n      <th>Domain_English</th>\n      <th>Field_English</th>\n      <th>SubField_English</th>\n      <th>2.00 SEQ</th>\n      <th>Source_title</th>\n      <th>srcid</th>\n      <th>issn_type</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>J</td>\n      <td>Yan, Z; Jing, XY; Pedrycz, W</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Yan, Zheng; Jing, Xuyang; Pedrycz, Witold</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>LEFusing and mining opinions for reputation ge...</td>\n      <td>INFORMATION FUSION</td>\n      <td>...</td>\n      <td>WOS:000394070100013</td>\n      <td>issn</td>\n      <td>15662535</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Artificial Intelligence &amp; Image Processing</td>\n      <td>31</td>\n      <td>Information Fusion</td>\n      <td>2.609900e+04</td>\n      <td>issn1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>J</td>\n      <td>Sookhak, M; Yu, FR; He, Y; Talebian, H; Safa, ...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Sookhak, Mehdi; Yu, F. 
Richard; He, Ying; Tale...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>FOG VEHICULAR COMPUTING Augmentation of Fog Co...</td>\n      <td>IEEE VEHICULAR TECHNOLOGY MAGAZINE</td>\n      <td>...</td>\n      <td>WOS:000408568800008</td>\n      <td>issn</td>\n      <td>15566072</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Networking &amp; Telecommunications</td>\n      <td>37</td>\n      <td>IEEE Vehicular Technology Magazine</td>\n      <td>5.200153e+09</td>\n      <td>issn1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>J</td>\n      <td>Ning, ZL; Dong, PR; Wang, XJ; Guo, L; Rodrigue...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Ning, Zhaolong; Dong, Peiran; Wang, Xiaojie; G...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Deep Reinforcement Learning for Intelligent In...</td>\n      <td>IEEE TRANSACTIONS ON COGNITIVE COMMUNICATIONS ...</td>\n      <td>...</td>\n      <td>WOS:000502789700018</td>\n      <td>issn</td>\n      <td>23327731</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Networking &amp; Telecommunications</td>\n      <td>37</td>\n      <td>IEEE Transactions on Cognitive Communications ...</td>\n      <td>2.110085e+10</td>\n      <td>issn1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>J</td>\n      <td>Wang, XD; Garg, S; Lin, H; Kaddoum, G; Hu, J; ...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Wang, Xiaoding; Garg, Sahil; Lin, Hui; Kaddoum...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>An Intelligent UAV based Data Aggregation Algo...</td>\n      <td>COMPUTER NETWORKS</td>\n      <td>...</td>\n      <td>WOS:000626758800004</td>\n      <td>issn</td>\n      <td>13891286</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Networking &amp; 
Telecommunications</td>\n      <td>37</td>\n      <td>Computer Networks</td>\n      <td>2.681100e+04</td>\n      <td>issn1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>J</td>\n      <td>Lu, TG; Chen, XY; McElroy, MB; Nielsen, CP; Wu...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Lu, Tianguang; Chen, Xinyu; McElroy, Michael B...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>A Reinforcement Learning-Based Decision System...</td>\n      <td>IEEE TRANSACTIONS ON SMART GRID</td>\n      <td>...</td>\n      <td>WOS:000641976000028</td>\n      <td>issn</td>\n      <td>19493053</td>\n      <td>Applied Sciences</td>\n      <td>Enabling &amp; Strategic Technologies</td>\n      <td>Energy</td>\n      <td>14</td>\n      <td>IEEE Transactions on Smart Grid</td>\n      <td>1.970017e+10</td>\n      <td>issn2</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>51897</th>\n      <td>J</td>\n      <td>Lai, ZL; Liu, W; Jian, XD; Bacsa, K; Sun, LM; ...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Lai, Zhilu; Liu, Wei; Jian, Xudong; Bacsa, Kir...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Neural modal ordinary differential equations: ...</td>\n      <td>DATA-CENTRIC ENGINEERING</td>\n      <td>...</td>\n      <td>WOS:000906995300001</td>\n      <td>eissn</td>\n      <td>NaN</td>\n      <td>Applied Sciences</td>\n      <td>Information &amp; Communication Technologies</td>\n      <td>Artificial Intelligence &amp; Image Processing</td>\n      <td>NaN</td>\n  
    <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>51898</th>\n      <td>J</td>\n      <td>Wang, HC; Roussel, P; Denby, B</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Wang, Hongcui; Roussel, Pierre; Denby, Bruce</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Improving ultrasound-based multimodal speech r...</td>\n      <td>JASA EXPRESS LETTERS</td>\n      <td>...</td>\n      <td>WOS:000642230800005</td>\n      <td>eissn</td>\n      <td>NaN</td>\n      <td>Natural Sciences</td>\n      <td>Physics &amp; Astronomy</td>\n      <td>Acoustics</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>51899</th>\n      <td>J</td>\n      <td>Zhang, R; Alpdogan, S; Kong, SQ; Muhammad, S</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Zhang, Rui; Alpdogan, Serdar; Kong, Shiqi; Muh...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Application of computer-aided image reconstruc...</td>\n      <td>EGYPTIAN JOURNAL OF NEUROSURGERY</td>\n      <td>...</td>\n      <td>WOS:000807222600001</td>\n      <td>eissn</td>\n      <td>NaN</td>\n      <td>Health Sciences</td>\n      <td>Clinical Medicine</td>\n      <td>Neurology &amp; Neurosurgery</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>51902</th>\n      <td>J</td>\n      <td>Chu, WP; Song, Y</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Chu, Wenping; Song, Yang</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Study on Dynamic Interaction of Railway Pantog...</td>\n      <td>VIBRATION</td>\n      <td>...</td>\n      <td>WOS:000661660800001</td>\n      <td>eissn</td>\n      <td>NaN</td>\n      <td>Applied Sciences</td>\n      <td>Engineering</td>\n      <td>Mechanical Engineering &amp; Transports</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      
<td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>51903</th>\n      <td>J</td>\n      <td>Lai, CS; Jia, YW; Dong, ZK; Wang, DX; Tao, YS;...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>Lai, Chun Sing; Jia, Youwei; Dong, Zhekang; Wa...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>A Review of Technical Standards for Smart Cities</td>\n      <td>CLEAN TECHNOLOGIES</td>\n      <td>...</td>\n      <td>WOS:000708219500008</td>\n      <td>eissn</td>\n      <td>NaN</td>\n      <td>Natural Sciences</td>\n      <td>Earth &amp; Environmental Sciences</td>\n      <td>Environmental Sciences</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n  </tbody>\n</table>\n<p>46060 rows × 80 columns</p>\n</div>"
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "['Domain_English', 'Field_English', 'SubField_English']"
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metrix_levels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "outputs": [],
   "source": [
    "record_countries = locations[[record_col,\"Country\"]].drop_duplicates()\n",
    "record_author_locations = author_locations[[record_col,\"author_str_id\",\"Country\"]].drop_duplicates()\n",
    "record_institution = univ_locations[[record_col,\"Institution\",\"Country\"]].drop_duplicates()\n",
    "country_types = locations[[\"Country\",\"Country_Type\"]].drop_duplicates()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "outputs": [],
   "source": [
    "# Basic network layout"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "outputs": [],
   "source": [
    "# Pair each country on a record with every other country on that record.\n",
    "# The self-merge yields both (A, B) and (B, A); at weight 0.5 per row the\n",
    "# two directions of a pair add up to 1.\n",
    "# NOTE(review): NaN != NaN is True, so rows with a missing Country would pass\n",
    "# the inequality filter - confirm Country has no NaN upstream.\n",
    "country_collabs = (\n",
    "    record_countries.merge(record_countries, on=record_col)\n",
    "    .loc[lambda pairs: pairs[\"Country_x\"] != pairs[\"Country_y\"]]\n",
    "    .assign(weight=0.5)\n",
    ")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "outputs": [],
   "source": [
    "# Pair each institution on a record with every other institution on that record.\n",
    "# The self-merge yields both (A, B) and (B, A); at weight 0.5 per row the\n",
    "# two directions of a pair add up to 1.\n",
    "# NOTE(review): NaN != NaN is True, so rows with a missing Institution would pass\n",
    "# the inequality filter - confirm Institution has no NaN upstream.\n",
    "inst_collabs = (\n",
    "    record_institution.merge(record_institution, on=record_col)\n",
    "    .loc[lambda pairs: pairs[\"Institution_x\"] != pairs[\"Institution_y\"]]\n",
    "    .assign(weight=0.5)\n",
    ")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "outputs": [
    {
     "data": {
      "text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n       'Conference Date', 'Conference Location', 'Conference Sponsor',\n       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n       'Funding Text', 'Cited References', 'Cited Reference Count',\n       'Times Cited, WoS Core', 'Times Cited, All Databases',\n       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n       'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n       'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n       'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n       'Number of Pages', 'WoS Categories', 'Web of Science Index',\n       'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n       'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n       'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n       'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n       'srcid', 'issn_type'],\n      dtype='object')"
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List every column of wos; the next cell scans these names to build drop_cols.\n",
    "wos.columns"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "outputs": [
    {
     "data": {
      "text/plain": "['Authors',\n 'Book Authors',\n 'Book Editors',\n 'Book Group Authors',\n 'Author Full Names',\n 'Book Author Full Names',\n 'Group Authors',\n 'Addresses',\n 'Reprint Addresses',\n 'Email Addresses',\n 'Researcher Ids',\n 'ORCIDs',\n 'Publisher Address',\n '2.00 SEQ']"
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Substrings that select the columns to leave out of the exported wos table\n",
    "# (author, editor, address, researcher-ID, ORCID and 'SEQ' columns).\n",
    "identifying_markers = (\"uthor\", \"ddress\", \"ORCID\", \"esearcher\", \"ditor\", \"name\", \"SEQ\")\n",
    "\n",
    "# Keyword columns (e.g. 'Author Keywords') contain 'uthor' but are kept on purpose.\n",
    "drop_cols = [\n",
    "    column\n",
    "    for column in wos.columns\n",
    "    if \"eyword\" not in column\n",
    "    and any(marker in column for marker in identifying_markers)\n",
    "]\n",
    "drop_cols"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "outputs": [],
   "source": [
    "# Folder (relative to the working directory) that receives every exported table.\n",
    "outdir = \"wos_processed_data\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "outputs": [],
   "source": [
    "# Make sure the export folder exists before writing any workbook.\n",
    "os.makedirs(outdir, exist_ok=True)\n",
    "\n",
    "# (frame, target path) pairs; each frame is written to its own workbook without the index.\n",
    "# The drop_cols columns are removed from a copy for the export only - wos itself is untouched.\n",
    "excel_exports = [\n",
    "    (wos.drop(columns=drop_cols), f\"{outdir}/wos_processed.xlsx\"),\n",
    "    (record_countries, f\"{outdir}/wos_countries.xlsx\"),\n",
    "    (record_author_locations, f\"{outdir}/wos_author_locations.xlsx\"),\n",
    "    (record_institution, f\"{outdir}/wos_institution_locations.xlsx\"),\n",
    "    (kw_df, f\"{outdir}/wos_keywords.xlsx\"),\n",
    "    (country_types, f\"{outdir}/wos_country_types.xlsx\"),\n",
    "]\n",
    "for export_frame, export_path in excel_exports:\n",
    "    export_frame.to_excel(export_path, index=False)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "outputs": [],
   "source": [
    "# (frame, target path) pairs; each frame is written as delimited text without the index.\n",
    "# Note: the files are tab-separated (sep='\\t') even though they carry a .csv extension.\n",
    "csv_exports = [\n",
    "    (wos.drop(columns=drop_cols), f\"{outdir}/wos_processed.csv\"),\n",
    "    (record_countries, f\"{outdir}/wos_countries.csv\"),\n",
    "    (record_author_locations, f\"{outdir}/wos_author_locations.csv\"),\n",
    "    (record_institution, f\"{outdir}/wos_institution_locations.csv\"),\n",
    "    (kw_df, f\"{outdir}/wos_keywords.csv\"),\n",
    "    (country_types, f\"{outdir}/wos_country_types.csv\"),\n",
    "    (inst_collabs, f\"{outdir}/wos_inst_collabs.csv\"),\n",
    "    (country_collabs, f\"{outdir}/wos_country_collabs.csv\"),\n",
    "]\n",
    "for export_frame, export_path in csv_exports:\n",
    "    export_frame.to_csv(export_path, index=False, sep='\\t')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "outputs": [],
   "source": [
    "# Write wos_areas and wos_subcat as tab-separated text (index omitted).\n",
    "for export_frame, export_path in (\n",
    "    (wos_areas, f\"{outdir}/wos_research_areas.csv\"),\n",
    "    (wos_subcat, f\"{outdir}/wos_categories.csv\"),\n",
    "):\n",
    "    export_frame.to_csv(export_path, index=False, sep='\\t')"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}