You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
blabla/WOS/wos_processing_legacy.ipynb

1300 lines
60 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re\n",
"# import spacy\n",
"#\n",
"# nlp = spacy.load(\"en_core_web_lg\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Column that uniquely identifies a Web of Science record throughout the notebook\n",
"record_col=\"UT (Unique WOS ID)\"\n",
"# NOTE(review): hardcoded absolute local path - breaks on any other machine; consider a configurable data directory\n",
"outfile = r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of initial records: 27672\n",
"Number of filtered records: 24653\n"
]
}
],
"source": [
"# Load the concatenated WOS export (tab-separated)\n",
"wos = pd.read_csv(outfile, sep=\"\\t\",low_memory=False)\n",
"print(f'Number of initial records: {len(wos)}')\n",
"# Science-Metrix journal classification, used to tag records with discipline labels\n",
"metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
"\n",
"\n",
"# Reshape so each ISSN variant column becomes its own row (long form)\n",
"metrix = metrix.set_index([c for c in metrix.columns if \"issn\" not in c]).stack().reset_index()\n",
"metrix = metrix.rename(columns={'level_6':\"issn_type\", 0:\"issn\"})\n",
"# Normalise ISSNs: strip dashes, lowercase, trim whitespace\n",
"metrix[\"issn\"]=metrix[\"issn\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"\n",
"wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\",\"\").str.lower().str.strip()\n",
"# Stack the two normalised ISSN variants so either can match the classification\n",
"wos = wos.set_index([c for c in wos.columns if \"issn\" not in c]).stack().reset_index()\n",
"# NOTE(review): 'level_71' depends on the exact number of non-issn columns in the export - fragile, verify on new exports\n",
"wos = wos.rename(columns={'level_71':\"issn_var\", 0:\"issn\"})\n",
"\n",
"wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
"# Keep one row per record; descending sort on issn_var makes 'issn' win over 'eissn' in the dedup\n",
"wos = wos_merge.sort_values(by=\"issn_var\",ascending=False).drop_duplicates(subset=record_col)\n",
"\n",
"# drop entries not indexed by metrix\n",
"wos = wos[~wos[\"Domain_English\"].isna()]\n",
"# drop duplicates (based on doi)\n",
"wos = wos[~((~wos[\"DOI\"].isna())&(wos[\"DOI\"].duplicated(False)))]\n",
"wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
"print(f'Number of filtered records: {len(wos)}')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": " Article Title \n23070 Stochastic bias of colour-selected BAO tracers... \\\n30139 A novel integrative approach elucidates fine-s... \n4538 Optimal Number of Clusters by Measuring Simila... \n34242 Analyzing the Noise Robustness of Deep Neural ... \n26727 Learning to Prompt for Open-Vocabulary Object ... \n... ... \n3290 Research on Reverse Skyline Query Algorithm Ba... \n45159 Using Recurrent Neural Network for Intelligent... \n21653 Output-Bounded and RBFNN-Based Position Tracki... \n43983 A Novel 3D Intelligent Cluster Method for Mali... \n11880 BlockHammer: Improving Flash Reliability by Ex... \n\n Keywords Plus \n23070 DIGITAL SKY SURVEY; BARYON ACOUSTIC-OSCILLATIO... \\\n30139 CHAOTIC GENETIC PATCHINESS; PELAGIC LARVAL DUR... \n4538 VALIDATION; ALGORITHM; TUTORIAL \n34242 VISUAL ANALYTICS \n26727 NaN \n... ... \n3290 MAPREDUCE \n45159 NaN \n21653 IMPEDANCE CONTROL; ROBOT \n43983 NETWORK INTRUSION DETECTION; DDOS DETECTION; A... \n11880 MEMORY; PERFORMANCE; RETENTION; ENDURANCE; OPT... \n\n Author Keywords \n23070 galaxies: evolution; galaxies: haloes; galaxie... \n30139 NaN \n4538 Event-related potentials; Optimal number of cl... \n34242 Neurons; Visualization; Data visualization; Fe... \n26727 NaN \n... ... \n3290 Big Data; Database Management; Database Query;... \n45159 water resources; intelligent prediction; water... \n21653 Security tele-surgery; RBFNN; bilateral positi... \n43983 Auto encoder; DDos detection; Attack classific... \n11880 Reliability; Three-dimensional displays; Error... \n\n[100 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Article Title</th>\n <th>Keywords Plus</th>\n <th>Author Keywords</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>23070</th>\n <td>Stochastic bias of colour-selected BAO tracers...</td>\n <td>DIGITAL SKY SURVEY; BARYON ACOUSTIC-OSCILLATIO...</td>\n <td>galaxies: evolution; galaxies: haloes; galaxie...</td>\n </tr>\n <tr>\n <th>30139</th>\n <td>A novel integrative approach elucidates fine-s...</td>\n <td>CHAOTIC GENETIC PATCHINESS; PELAGIC LARVAL DUR...</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4538</th>\n <td>Optimal Number of Clusters by Measuring Simila...</td>\n <td>VALIDATION; ALGORITHM; TUTORIAL</td>\n <td>Event-related potentials; Optimal number of cl...</td>\n </tr>\n <tr>\n <th>34242</th>\n <td>Analyzing the Noise Robustness of Deep Neural ...</td>\n <td>VISUAL ANALYTICS</td>\n <td>Neurons; Visualization; Data visualization; Fe...</td>\n </tr>\n <tr>\n <th>26727</th>\n <td>Learning to Prompt for Open-Vocabulary Object ...</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>3290</th>\n <td>Research on Reverse Skyline Query Algorithm Ba...</td>\n <td>MAPREDUCE</td>\n <td>Big Data; Database Management; Database Query;...</td>\n </tr>\n <tr>\n <th>45159</th>\n <td>Using Recurrent Neural Network for Intelligent...</td>\n <td>NaN</td>\n <td>water resources; intelligent prediction; water...</td>\n </tr>\n <tr>\n <th>21653</th>\n <td>Output-Bounded and RBFNN-Based Position Tracki...</td>\n <td>IMPEDANCE CONTROL; ROBOT</td>\n <td>Security tele-surgery; RBFNN; bilateral positi...</td>\n </tr>\n <tr>\n <th>43983</th>\n <td>A Novel 3D Intelligent Cluster Method for 
Mali...</td>\n <td>NETWORK INTRUSION DETECTION; DDOS DETECTION; A...</td>\n <td>Auto encoder; DDos detection; Attack classific...</td>\n </tr>\n <tr>\n <th>11880</th>\n <td>BlockHammer: Improving Flash Reliability by Ex...</td>\n <td>MEMORY; PERFORMANCE; RETENTION; ENDURANCE; OPT...</td>\n <td>Reliability; Three-dimensional displays; Error...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600266 ANME\n1 WOS:000208863600266 PYROSEQUENCING\n2 WOS:000208863600266 AOM\n3 WOS:000208863600266 COMMUNITY STRUCTURE\n4 WOS:000208863600266 NYEGGA\n.. ... ...\n99 WOS:000286328200009 NORTH-EAST ASIA\n100 WOS:000286328200009 PLEISTOCENE\n101 WOS:000286328200009 SAKHALIN ISLAND\n102 WOS:000286373200134 NEURAL NETWORKS\n103 WOS:000286373200134 FUZZY LOGIC\n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600266</td>\n <td>ANME</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600266</td>\n <td>PYROSEQUENCING</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600266</td>\n <td>AOM</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600266</td>\n <td>COMMUNITY STRUCTURE</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600266</td>\n <td>NYEGGA</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000286328200009</td>\n <td>NORTH-EAST ASIA</td>\n </tr>\n <tr>\n <th>100</th>\n <td>WOS:000286328200009</td>\n <td>PLEISTOCENE</td>\n </tr>\n <tr>\n <th>101</th>\n <td>WOS:000286328200009</td>\n <td>SAKHALIN ISLAND</td>\n </tr>\n <tr>\n <th>102</th>\n <td>WOS:000286373200134</td>\n <td>NEURAL NETWORKS</td>\n </tr>\n <tr>\n <th>103</th>\n <td>WOS:000286373200134</td>\n <td>FUZZY LOGIC</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a long table of (record id, keyword), combining both WOS keyword fields\n",
"kw_df = pd.DataFrame()\n",
"for c in [\"Keywords Plus\",\"Author Keywords\"]:\n",
" # Split the semicolon-separated keyword string into one row per keyword, upper-cased\n",
" kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
" kwp.name = 'keyword_all'\n",
" kw_df = pd.concat([kwp.reset_index(),kw_df],ignore_index=True)\n",
"# Drop missing keywords plus the helper index level produced by groupby/apply, then dedupe\n",
"kw_df = kw_df[~kw_df[\"keyword_all\"].isna()].copy().drop(columns=\"level_1\").drop_duplicates()\n",
"# Remove any parenthesised or bracketed qualifier from each keyword\n",
"kw_df[\"keyword_all\"] = kw_df[\"keyword_all\"].apply(lambda x: re.sub(\"[\\(\\[].*?[\\)\\]]\", \"\", x))\n",
"kw_df.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600266 ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...\n1 WOS:000209236900010 ACTIVE PERCEPTION; SPARSE CODING; REINFORCEMEN...\n2 WOS:000209331600009 SLEEP PATTERN; ELDER-CARE; PRESSURE SENSOR; NA...\n3 WOS:000209571700012 PERSONALIZED MEDICINE; COMPLEX NETWORK; CLINIC...\n4 WOS:000209810700046 CORROSION CHARACTERIZATION; FEATURE EXTRACTION...",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600266</td>\n <td>ANME; PYROSEQUENCING; AOM; COMMUNITY STRUCTURE...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000209236900010</td>\n <td>ACTIVE PERCEPTION; SPARSE CODING; REINFORCEMEN...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000209331600009</td>\n <td>SLEEP PATTERN; ELDER-CARE; PRESSURE SENSOR; NA...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000209571700012</td>\n <td>PERSONALIZED MEDICINE; COMPLEX NETWORK; CLINIC...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000209810700046</td>\n <td>CORROSION CHARACTERIZATION; FEATURE EXTRACTION...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-aggregate: one semicolon-joined keyword string per record\n",
"wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n",
"wos_kwd_concat.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# from keybert import KeyBERT\n",
"#\n",
"# kw_model = KeyBERT(model='all-mpnet-base-v2')\n",
"#\n",
"# def kwd_extract(text):\n",
"# keywords = kw_model.extract_keywords(text,\n",
"#\n",
"# keyphrase_ngram_range=(1, 2),\n",
"#\n",
"# stop_words='english',\n",
"#\n",
"# highlight=False,\n",
"#\n",
"# top_n=3)\n",
"# return \"; \".join([i[0].upper() for i in keywords])\n",
"#\n",
"# kwd_extract(text=\"Artificial Intelligence: New Frontiers in Real-Time Inverse Scattering and Electromagnetic Imaging - In recent years, artificial intelligence (AI) techniques have been developed rapidly. With the ...\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',\n 'Book Group Authors', 'Author Full Names', 'Book Author Full Names',\n 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',\n 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',\n 'Conference Date', 'Conference Location', 'Conference Sponsor',\n 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',\n 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',\n 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',\n 'Funding Text', 'Cited References', 'Cited Reference Count',\n 'Times Cited, WoS Core', 'Times Cited, All Databases',\n '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',\n 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',\n 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',\n 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement',\n 'Special Issue', 'Meeting Abstract', 'Start Page', 'End Page',\n 'Article Number', 'DOI', 'DOI Link', 'Book DOI', 'Early Access Date',\n 'Number of Pages', 'WoS Categories', 'Web of Science Index',\n 'Research Areas', 'IDS Number', 'Pubmed Id', 'Open Access Designations',\n 'Highly Cited Status', 'Hot Paper Status', 'Date of Export',\n 'UT (Unique WOS ID)', 'issn_var', 'issn', 'Domain_English',\n 'Field_English', 'SubField_English', '2.00 SEQ', 'Source_title',\n 'srcid', 'issn_type'],\n dtype='object')"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"geotext = GeoText()\n",
"\n",
"def extract_location(input_text, key='countries'):\n",
"    \"\"\"Extract the alphabetically first country or city found in an address string.\n",
"\n",
"    key is 'countries' or 'cities' (flashgeotext result keys). For countries,\n",
"    falls back to UK constituent-country names and to a lookup of spellings or\n",
"    abbreviations that flashgeotext misses in WOS addresses. Returns None when\n",
"    nothing matches.\n",
"    \"\"\"\n",
"    # Substrings as they appear in WOS addresses -> canonical country names.\n",
"    # Entries mapping a name to itself exist because flashgeotext misses them.\n",
"    anomalies = {\"Malta\":\"Malta\",\n",
"                 \"Mongolia\":\"Mongolia\",\n",
"                 \"Quatar\":\"Qatar\",\n",
"                 \"Qatar\":\"Qatar\",\n",
"                 \"Ethiop\":\"Ethiopia\",\n",
"                 \"Nigeria\":\"Nigeria\",\n",
"                 \"BELAR\":\"Belarus\",\n",
"                 \"Venezuela\":\"Venezuela\",\n",
"                 \"Cyprus\":\"Cyprus\",\n",
"                 \"Ecuador\":\"Ecuador\",\n",
"                 \"U Arab\":\"United Arab Emirates\",\n",
"                 \"Syria\":\"Syria\",\n",
"                 \"Uganda\":\"Uganda\",\n",
"                 \"Yemen\":\"Yemen\",\n",
"                 \"Mali\":\"Mali\",\n",
"                 \"Senegal\":\"Senegal\",\n",
"                 \"Vatican\":\"Vatican\",\n",
"                 \"Uruguay\":\"Uruguay\",\n",
"                 \"Panama\":\"Panama\",\n",
"                 \"Fiji\":\"Fiji\",\n",
"                 \"Faroe\":\"Faroe Islands\",\n",
"                 \"Macedonia\":\"Macedonia\",\n",
"                 \"Mozambique\":\"Mozambique\",\n",
"                 \"Kuwait\":\"Kuwait\",\n",
"                 \"Libya\":\"Libya\",\n",
"                 \"Turkiy\":\"Turkey\",\n",
"                 \"Liberia\":\"Liberia\",\n",
"                 \"Namibia\":\"Namibia\",\n",
"                 \"Ivoire\":\"Ivory Coast\",\n",
"                 \"Guatemala\":\"Guatemala\",  # BUGFIX: value was misspelled 'Gutemala'\n",
"                 \"Paraguay\":\"Paraguay\",\n",
"                 \"Honduras\":\"Honduras\",\n",
"                 \"Nicaragua\":\"Nicaragua\",\n",
"                 \"Trinidad\":\"Trinidad & Tobago\",\n",
"                 \"Liechtenstein\":\"Liechtenstein\",\n",
"                 \"Greenland\":\"Denmark\"}\n",
"\n",
"    extracted = geotext.extract(input_text=input_text)\n",
"    found = extracted[key].keys()\n",
"    if len(found) > 0:\n",
"        # Alphabetically first match keeps the result deterministic\n",
"        return sorted(found)[0]\n",
"    elif key == 'countries':\n",
"        # UK constituent countries appear under their own names in WOS addresses\n",
"        for uk_part in ['Scotland', 'Wales', 'England', 'N Ireland']:\n",
"            if uk_part in input_text:\n",
"                return 'United Kingdom'\n",
"        for fragment in anomalies.keys():\n",
"            if fragment in input_text:\n",
"                return anomalies.get(fragment)\n",
"    # Nothing recognised (covers both keys)\n",
"    return None\n",
"\n",
"# EU membership list: a single comma-separated line\n",
"with open('../eu_members.txt',\"r\") as f:\n",
"    eu_countries = f.readline().split(\",\")\n",
"    eu_countries = [i.strip() for i in eu_countries]\n",
"\n",
"def country_type(country):\n",
"    \"\"\"Classify a country name as 'EU', 'China', 'Non-EU associate' or 'Other'.\"\"\"\n",
"    if country in eu_countries:\n",
"        return \"EU\"\n",
"    elif country == \"China\":\n",
"        return \"China\"\n",
"    elif country in [\"Switzerland\", 'Norway', 'United Kingdom']:\n",
"        return \"Non-EU associate\"\n",
"    else:\n",
"        return \"Other\"\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Explode Addresses into one row per address chunk; the code below splits on\n",
"# '[' / ']' because each chunk looks like '[Authors] Institution, City, Country'\n",
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"locations = locations[locations[\"Addresses\"]!=\"\"].copy()\n",
"# Text after ']' is the postal address, text before it the authors at that address\n",
"locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
"locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])\n",
"locations[\"Country\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='countries'))\n",
"locations[\"City\"]=locations['Address'].apply(lambda x: extract_location(input_text=x, key='cities'))\n",
"locations[\"Country_Type\"] = locations[\"Country\"].apply(lambda x: country_type(x))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [],
"source": [
"# Keep only addresses in the regions this analysis compares\n",
"scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n",
"locations=locations[locations[\"Country_Type\"].isin(scope_types)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Address \n1 WOS:000208863600266 Univ Bergen, Ctr Geobiol, Dept Biol, N-5020 B... \\\n2 WOS:000208863600266 Chinese Acad Sci, Guangzhou Inst Geochem, Gua... \n3 WOS:000208863600266 Univ Bergen, Dept Earth Sci, N-5020 Bergen, N... \n5 WOS:000209236900010 Goethe Univ Frankfurt, Frankfurt Inst Adv Stu... \n6 WOS:000209236900010 Ecole Normale Super Cachan Bretagne, Bruz, Fr... \n\n Country City Country_Type \n1 Norway Bergen Non-EU associate \\\n2 China Guangzhou China \n3 Norway Bergen Non-EU associate \n5 Germany Frankfurt (Oder) EU \n6 France Cachan EU \n\n Institution \n1 Univ Bergen \n2 Chinese Acad Sci \n3 Univ Bergen \n5 Goethe Univ Frankfurt \n6 Ecole Normale Super Cachan Bretagne ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Address</th>\n <th>Country</th>\n <th>City</th>\n <th>Country_Type</th>\n <th>Institution</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>WOS:000208863600266</td>\n <td>Univ Bergen, Ctr Geobiol, Dept Biol, N-5020 B...</td>\n <td>Norway</td>\n <td>Bergen</td>\n <td>Non-EU associate</td>\n <td>Univ Bergen</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600266</td>\n <td>Chinese Acad Sci, Guangzhou Inst Geochem, Gua...</td>\n <td>China</td>\n <td>Guangzhou</td>\n <td>China</td>\n <td>Chinese Acad Sci</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600266</td>\n <td>Univ Bergen, Dept Earth Sci, N-5020 Bergen, N...</td>\n <td>Norway</td>\n <td>Bergen</td>\n <td>Non-EU associate</td>\n <td>Univ Bergen</td>\n </tr>\n <tr>\n <th>5</th>\n <td>WOS:000209236900010</td>\n <td>Goethe Univ Frankfurt, Frankfurt Inst Adv Stu...</td>\n <td>Germany</td>\n <td>Frankfurt (Oder)</td>\n <td>EU</td>\n <td>Goethe Univ Frankfurt</td>\n </tr>\n <tr>\n <th>6</th>\n <td>WOS:000209236900010</td>\n <td>Ecole Normale Super Cachan Bretagne, Bruz, Fr...</td>\n <td>France</td>\n <td>Cachan</td>\n <td>EU</td>\n <td>Ecole Normale Super Cachan Bretagne</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations = locations[[record_col,\"Address\",\"Country\",\"City\",\"Country_Type\"]].copy()\n",
"# Institution name = first comma-separated chunk of the address\n",
"univ_locations[\"Institution\"] = univ_locations[\"Address\"].apply(lambda x: x.split(\",\")[0])\n",
"univ_locations = univ_locations.drop_duplicates()\n",
"univ_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"def md5hash(s: str):\n",
"    \"\"\"Return the hex MD5 digest of the UTF-8 encoding of ``s``.\"\"\"\n",
"    digest = hashlib.md5(s.encode('utf-8'))\n",
"    return digest.hexdigest()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n1 WOS:000208863600266 Norway Non-EU associate \n2 WOS:000208863600266 Norway Non-EU associate \n3 WOS:000208863600266 Norway Non-EU associate \n4 WOS:000208863600266 Norway Non-EU associate \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n1 d603b89121a1f279bf03b6f65d1389fa \n2 2fcb84e544f1558ead61dcf846027b7d \n3 6550a1d5fbd1b643f4732d40f2ed4d78 \n4 56485e2bd170d199887af88f3d0a9777 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600266</td>\n <td>China</td>\n <td>China</td>\n <td>5dfb4f0408a2cc8b7f36f5516938b62c</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600266</td>\n <td>Norway</td>\n <td>Non-EU associate</td>\n <td>d603b89121a1f279bf03b6f65d1389fa</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600266</td>\n <td>Norway</td>\n <td>Non-EU associate</td>\n <td>2fcb84e544f1558ead61dcf846027b7d</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600266</td>\n <td>Norway</td>\n <td>Non-EU associate</td>\n <td>6550a1d5fbd1b643f4732d40f2ed4d78</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600266</td>\n <td>Norway</td>\n <td>Non-EU associate</td>\n <td>56485e2bd170d199887af88f3d0a9777</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per (record, country, author): split the author list attached to each address\n",
"author_locations = locations.groupby([record_col,\"Country\",\"Country_Type\"])[\"Authors_of_address\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_3\")\n",
"author_locations[\"Author_name\"] = author_locations[\"Authors_of_address\"].str.strip()\n",
"author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
"# Pseudonymise: lowercase alphanumerics of the name, then MD5 as author_str_id.\n",
"# NOTE(review): distinct authors whose names normalise identically collide on this id\n",
"author_locations[\"author_str_id\"] = author_locations[\"Author_name\"].apply(lambda x:''.join(filter(str.isalnum, x.lower())))\n",
"author_locations[\"author_str_id\"] = author_locations[\"author_str_id\"].apply(md5hash)\n",
"author_locations = author_locations.drop(columns=\"Author_name\")\n",
"author_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n6 WOS:000209236900010 China China \n7 WOS:000209236900010 China China \n8 WOS:000209236900010 France EU \n10 WOS:000209236900010 Germany EU \n... ... ... ... \n321236 WOS:000953367000001 China China \n321237 WOS:000953367000001 China China \n321238 WOS:000953367000001 China China \n321239 WOS:000953367000001 China China \n321241 WOS:000953367000001 United Kingdom Non-EU associate \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n6 b406b8485c286091a46aca4999f294d3 \n7 abf37b879540b7c2eeb86787a467de29 \n8 2c559a54c654ab6dbc23d20ae82a0501 \n10 2c559a54c654ab6dbc23d20ae82a0501 \n... ... \n321236 99ef5c82ba66e07f9aa2d3f9fc7c45f7 \n321237 d013bf53d094540f90db9224b3eb9922 \n321238 702962f6fe47bac08520ae556a8e0e02 \n321239 99ef5c82ba66e07f9aa2d3f9fc7c45f7 \n321241 9cc42be570a5464bca0ea4b6b39d0271 \n\n[277884 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600266</td>\n <td>China</td>\n <td>China</td>\n <td>5dfb4f0408a2cc8b7f36f5516938b62c</td>\n </tr>\n <tr>\n <th>6</th>\n <td>WOS:000209236900010</td>\n <td>China</td>\n <td>China</td>\n <td>b406b8485c286091a46aca4999f294d3</td>\n </tr>\n <tr>\n <th>7</th>\n <td>WOS:000209236900010</td>\n <td>China</td>\n <td>China</td>\n <td>abf37b879540b7c2eeb86787a467de29</td>\n </tr>\n <tr>\n <th>8</th>\n <td>WOS:000209236900010</td>\n <td>France</td>\n <td>EU</td>\n <td>2c559a54c654ab6dbc23d20ae82a0501</td>\n </tr>\n <tr>\n <th>10</th>\n <td>WOS:000209236900010</td>\n <td>Germany</td>\n <td>EU</td>\n <td>2c559a54c654ab6dbc23d20ae82a0501</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>321236</th>\n <td>WOS:000953367000001</td>\n <td>China</td>\n <td>China</td>\n <td>99ef5c82ba66e07f9aa2d3f9fc7c45f7</td>\n </tr>\n <tr>\n <th>321237</th>\n <td>WOS:000953367000001</td>\n <td>China</td>\n <td>China</td>\n <td>d013bf53d094540f90db9224b3eb9922</td>\n </tr>\n <tr>\n <th>321238</th>\n <td>WOS:000953367000001</td>\n <td>China</td>\n <td>China</td>\n <td>702962f6fe47bac08520ae556a8e0e02</td>\n </tr>\n <tr>\n <th>321239</th>\n <td>WOS:000953367000001</td>\n <td>China</td>\n <td>China</td>\n <td>99ef5c82ba66e07f9aa2d3f9fc7c45f7</td>\n </tr>\n <tr>\n <th>321241</th>\n <td>WOS:000953367000001</td>\n <td>United Kingdom</td>\n <td>Non-EU associate</td>\n <td>9cc42be570a5464bca0ea4b6b39d0271</td>\n </tr>\n </tbody>\n</table>\n<p>277884 rows × 4 
columns</p>\n</div>"
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_locations[author_locations['author_str_id'].duplicated(False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# One affiliation per (record, author): ascending sort on Country_Type means the\n",
"# alphabetically first type ('China' < 'EU' < 'Non-EU associate') wins the dedup\n",
"author_primary_region = author_locations.sort_values(by=\"Country_Type\").drop_duplicates(subset=[record_col,\"author_str_id\"])\n",
"# author_primary_region\n",
"\n",
"# Record ids with at least one author primarily affiliated in each region\n",
"china=author_primary_region[author_primary_region[\"Country_Type\"]==\"China\"][record_col].unique()\n",
"eu=author_primary_region[author_primary_region[\"Country_Type\"]==\"EU\"][record_col].unique()\n",
"assoc=author_primary_region[author_primary_region[\"Country_Type\"]==\"Non-EU associate\"][record_col].unique()\n",
"\n",
"\n",
"# records that have distinct authors with different country affiliations\n",
"# (i.e. China AND at least one of EU / Non-EU associate)\n",
"valid_scope = wos[((wos[record_col].isin(china))\n",
" &\n",
" ((wos[record_col].isin(eu))\n",
" |\n",
" (wos[record_col].isin(assoc))))][record_col].unique()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Country Country_Type \n0 WOS:000208863600266 China China \\\n299168 WOS:000840488600001 China China \n299169 WOS:000840488600001 China China \n101376 WOS:000434663200012 China China \n101374 WOS:000434663200012 China China \n\n author_str_id \n0 5dfb4f0408a2cc8b7f36f5516938b62c \n299168 3462304c908993a828cdd0ff91ea4aaa \n299169 68ab59c442eb882af13a8273439cf840 \n101376 304c36b8b677f41a489894dc66a8461c \n101374 c04795fe195dcadb58bed5c81125ea35 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Country</th>\n <th>Country_Type</th>\n <th>author_str_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600266</td>\n <td>China</td>\n <td>China</td>\n <td>5dfb4f0408a2cc8b7f36f5516938b62c</td>\n </tr>\n <tr>\n <th>299168</th>\n <td>WOS:000840488600001</td>\n <td>China</td>\n <td>China</td>\n <td>3462304c908993a828cdd0ff91ea4aaa</td>\n </tr>\n <tr>\n <th>299169</th>\n <td>WOS:000840488600001</td>\n <td>China</td>\n <td>China</td>\n <td>68ab59c442eb882af13a8273439cf840</td>\n </tr>\n <tr>\n <th>101376</th>\n <td>WOS:000434663200012</td>\n <td>China</td>\n <td>China</td>\n <td>304c36b8b677f41a489894dc66a8461c</td>\n </tr>\n <tr>\n <th>101374</th>\n <td>WOS:000434663200012</td>\n <td>China</td>\n <td>China</td>\n <td>c04795fe195dcadb58bed5c81125ea35</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"author_primary_region.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of records: 24653\n",
"Number of valid records: 22081\n"
]
}
],
"source": [
"print(f'Number of records: {len(wos)}')\n",
"print(f'Number of valid cooperation records: {len(valid_scope)}')"
]
},
{
"cell_type": "code",
"execution_count": 66,
"outputs": [],
"source": [
"# Restrict every working table to records with valid China-EU/associate cooperation\n",
"wos = wos[wos[record_col].isin(valid_scope)]\n",
"locations = locations[locations[record_col].isin(valid_scope)]\n",
"univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
"author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
"# BUGFIX: this line previously filtered author_locations (copy-paste error),\n",
"# silently replacing author_primary_region and discarding its per-author dedup\n",
"author_primary_region = author_primary_region[author_primary_region[record_col].isin(valid_scope)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# One row per (record, affiliation) from the semicolon-separated Affiliations field\n",
"affiliations = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"# Normalise to upper case; records with no affiliation are kept as 'UNKNOWN'\n",
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.strip().str.upper().fillna(\"UNKNOWN\")\n",
"affiliations = affiliations.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"outputs": [
{
"data": {
"text/plain": "Affiliations\nCHINESE ACADEMY OF SCIENCES 2688\nUNIVERSITY OF LONDON 1251\nUDICE-FRENCH RESEARCH UNIVERSITIES 1038\nCENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS) 978\nTSINGHUA UNIVERSITY 960\n ... \nITALIAN INSTITUTE FOR GENOMIC MEDICINE (IIGM) 1\nSHENYANG INSTITUTE OF ENGINEERING 1\nXIANYANG NORMAL UNIVERSITY 1\nAGILENT TECHNOLOGIES 1\nUNIVERSIDAD DE ESPECIALIDADES ESPIRITU SANTO 1\nName: count, Length: 6117, dtype: int64"
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 68,
"outputs": [
{
"data": {
"text/plain": "Institution\n Chinese Acad Sci 2708\n Tsinghua Univ 1170\n Shanghai Jiao Tong Univ 978\n Zhejiang Univ 902\n Univ Chinese Acad Sci 753\n ... \n Univ Namur 1\n Qianhai Inst Innovat Res 1\n UN 1\n Vienna Int Ctr 1\n Engn Res Ctr Urban Underground Space Dev Zhejiang 1\nName: count, Length: 11670, dtype: int64"
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 61,
"outputs": [
{
"data": {
"text/plain": "22081"
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 62,
"outputs": [
{
"data": {
"text/plain": "22081"
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 63,
"outputs": [
{
"data": {
"text/plain": "99343"
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"univ_locations[\"Institution\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 64,
"outputs": [
{
"data": {
"text/plain": "130533"
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"affiliations[\"Affiliations\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"158916 162684\n"
]
}
],
"source": [
"aff_ = wos.groupby(record_col)[\"Affiliations\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"loc_ = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"print(len(aff_),len(loc_))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"unique_inst = sorted([i.split(\" \") for i in list(affiliations[\"Affiliations\"].unique())], key=len)\n",
"# unique_inst = [[''.join(filter(str.isalnum, i)) for i in i_list] for i_list in unique_inst]\n",
"unique_inst = [[i.strip(\",\").strip(\"(\").strip(\")\") for i in i_list] for i_list in unique_inst]\n",
"unique_inst"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def institution_chunk_norris(text):\n",
"    \"\"\"Return the first known institution whose tokens all occur in `text`.\n",
"\n",
"    Candidates come from the module-level `unique_inst` list (sorted\n",
"    shortest-first), so the least specific matching name wins. Returns the\n",
"    sentinel 'ERROR' when no candidate matches.\n",
"    \"\"\"\n",
"    # Tokenising/normalising the text is loop-invariant: do it once instead\n",
"    # of once per candidate (the original recomputed it on every iteration).\n",
"    text_tokens = {tok.strip(\",\").strip(\"(\").strip(\")\") for tok in text.split(\" \")}\n",
"    # Distinct loop names: the original reused `i` for both the candidate\n",
"    # token list and the comprehension variable, which obscured the logic.\n",
"    for inst_tokens in unique_inst:\n",
"        if all(token in text_tokens for token in inst_tokens):\n",
"            return \" \".join(inst_tokens)\n",
"    return \"ERROR\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"affiliations[\"Affiliations_merged\"] = affiliations[\"Affiliations\"].apply(lambda x: institution_chunk_norris(x))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"affiliations[\"Affiliations\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"affiliations[\"Affiliations_merged\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"affiliations[affiliations[\"Affiliations_merged\"]==\"ERROR\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk.metrics import edit_distance\n",
"from nltk.metrics import edit_distance_align\n",
"#results = df.apply(lambda x: edit_distance(x[\"column1\"], x[\"column2\"]), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"affiliations = affiliations.merge(univ_locations, on=record_col)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"affiliations[\"Affiliations\"] = affiliations[\"Affiliations\"].str.upper().str.strip()\n",
"affiliations[\"Institution\"] = affiliations[\"Institution\"].str.upper().str.strip()\n",
"\n",
"affiliations[\"levehnstein\"] = affiliations.apply(\n",
" lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n",
"affiliations.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"def tok_overlap(lon_str, short_str):\n",
"    \"\"\"Token-level alignment cost between two space-separated strings.\n",
"\n",
"    Builds the pairwise edit-distance matrix between the tokens of the two\n",
"    strings, sums the per-token best matches along each axis, and keeps the\n",
"    cheaper direction. Small values mean an abbreviated name lines up well\n",
"    with the long form, e.g. 'UNIV AMSTER TECH' vs 'UNIVERSITY AMSTERDAM TECHNICAL'.\n",
"    \"\"\"\n",
"    long_tokens = lon_str.split(\" \")\n",
"    short_tokens = short_str.split(\" \")\n",
"    # dtype=float matches the np.fromfunction default of the earlier version.\n",
"    pairwise = np.array(\n",
"        [[edit_distance(lt, st) for st in short_tokens] for lt in long_tokens],\n",
"        dtype=float,\n",
"    )\n",
"    # Same quantity as min(frame.min().sum(), frame.T.min().sum()) computed on\n",
"    # a DataFrame, without constructing one.\n",
"    return min(pairwise.min(axis=0).sum(), pairwise.min(axis=1).sum())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Smoke-test tok_overlap on a concrete pair. The previous version referenced\n",
"# undefined names `l` and `s` (leftover kernel state) and called .shape on the\n",
"# scalar return value, so it could not run from a fresh kernel.\n",
"long_example = (\"UNIVERSITY\", \"AMSTERDAM\", \"TECHNICAL\", \"LOCAL\")\n",
"short_example = (\"UNIV\", \"AMSTER\", \"TECH\", \"LOCAL\")\n",
"tok_overlap(lon_str=\" \".join(long_example), short_str=\" \".join(short_example))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"affiliations[\"token_overlap\"] = affiliations.apply(\n",
" lambda x: tok_overlap(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n",
"affiliations.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"affiliations.sort_values(by=[record_col,\"Affiliations\",\"token_overlap\"], ascending=[False,False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"helper = affiliations.sort_values(by=[\"Affiliations\",\"token_overlap\"], ascending=[False,True]).drop_duplicates(subset=[record_col,\"Affiliations\"])\n",
"afh = helper[[\"Affiliations\",\"Institution\",\"Country\"]]\n",
"afh.groupby(\"Affiliations\")[\"Institution\"].agg(pd.Series.mode)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"afh.groupby(\"Affiliations\")[\"Country\"].agg(pd.Series.mode)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Candidate institution/location per affiliation: for each distance metric,\n",
"# keep the best-matching address institution per (record, affiliation), then\n",
"# take the modal value across records. Both metrics vote jointly — their\n",
"# per-metric modes are concatenated (token_overlap first, then levehnstein,\n",
"# preserving the original ordering) before the final mode is taken.\n",
"def _first_mode(series):\n",
"    \"\"\"First value of the statistical mode (ties broken by sort order).\"\"\"\n",
"    return pd.Series.mode(series)[0]\n",
"\n",
"candidate_cols = [\"Institution\", \"Country\", \"City\", \"Country_Type\"]\n",
"per_metric_modes = {col: [] for col in candidate_cols}\n",
"for metric in [\"token_overlap\", \"levehnstein\"]:\n",
"    best_match = affiliations.sort_values(\n",
"        by=[\"Affiliations\", metric], ascending=[False, True]\n",
"    ).drop_duplicates(subset=[record_col, \"Affiliations\"])\n",
"    for col in candidate_cols:\n",
"        per_metric_modes[col].append(\n",
"            best_match.groupby(\"Affiliations\")[col].apply(pd.Series.mode).reset_index()\n",
"        )\n",
"\n",
"mode_i = pd.concat(per_metric_modes[\"Institution\"], ignore_index=True)[[\"Affiliations\", \"Institution\"]].groupby(\"Affiliations\")[\"Institution\"].agg(_first_mode)\n",
"mode_c = pd.concat(per_metric_modes[\"Country\"], ignore_index=True)[[\"Affiliations\", \"Country\"]].groupby(\"Affiliations\")[\"Country\"].agg(_first_mode)\n",
"mode_city = pd.concat(per_metric_modes[\"City\"], ignore_index=True)[[\"Affiliations\", \"City\"]].groupby(\"Affiliations\")[\"City\"].agg(_first_mode)\n",
"mode_type = pd.concat(per_metric_modes[\"Country_Type\"], ignore_index=True)[[\"Affiliations\", \"Country_Type\"]].groupby(\"Affiliations\")[\"Country_Type\"].agg(_first_mode)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from functools import reduce\n",
"dfs = [mode_i, mode_c, mode_city, mode_type]\n",
"mode_final = reduce(lambda left,right: pd.merge(left,right,on='Affiliations'), dfs)\n",
"mode_final = mode_final.reset_index()\n",
"mode_final.columns = [\"Affiliations\",\"Institution (short name from address)\",\"Country_candidate\",\"City_candidate\",\"Country_type_candidate\"]\n",
"mode_final"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"aff_lookup = affiliations[[\"Affiliations\",\"Institution\",\"levehnstein\"]].drop_duplicates().sort_values(by=[\"Affiliations\",\"levehnstein\"],ascending=[True,True])\n",
"aff_lookup"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"aff_lookup_levehnstein = aff_lookup.copy()\n",
"aff_lookup_overlap = aff_lookup.copy()\n",
"inst_short = sorted([i.split(\" \") for i in list(aff_lookup_overlap[\"Institution\"].unique())], key=len)\n",
"inst_short"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"aff_lookup.drop_duplicates(subset=\"Affiliations\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# aff_m = pd.DataFrame(affiliations[\"Affiliations\"].unique(), columns=[\"Affiliations\"])\n",
"# inst_m = pd.DataFrame(affiliations[[\"Institution\",\"Country_Type\",\"Country\",\"City\"]].drop_duplicates(),columns=[\"Institution\",\"Country_Type\",\"Country\",\"City\"])\n",
"#\n",
"# aff_lookup = aff_m.merge(inst_m, how='cross')\n",
"#\n",
"# # aff_lookup[\"levehnstein\"] = aff_lookup.apply(\n",
"# # lambda x: edit_distance(x[\"Affiliations\"], x[\"Institution\"]), axis=1)\n",
"#\n",
"# aff_lookup.assign(distance=[*map(edit_distance, aff_lookup.Affiliations, aff_lookup.Institution)])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"affiliations[\"levehnstein\"].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"affiliations[\"token_overlap\"].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"affiliations[affiliations[\"Affiliations\"].str.contains(\"A*STAR\",regex=False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"affiliations = affiliations.sort_values(by=[record_col,\"Affiliations\",\"levehnstein\"], ascending=[False,False,True])\n",
"affiliations_merge = affiliations.drop_duplicates(subset=[record_col,\"Affiliations\"])\n",
"affiliations_merge.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wos_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[c for c in wos.columns if \"_English\" in c]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"from matplotlib.ticker import MaxNLocator\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wos = wos[((wos[\"Publication Year\"]<2023) & (~wos['Domain_English'].isna()))]\n",
"\n",
"metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
"for m in metrix_levels:\n",
" wos[m] = wos[m].replace({\"article-level classification\":\"Miscellaneous\"})\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wos"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"metrix_levels"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"outdir=\"wos_processed_data\""
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"os.makedirs(outdir, exist_ok=True)\n",
"\n",
"wos.to_excel(f\"{outdir}/wos_processed.xlsx\", index=False)\n",
"\n",
"locations.drop(columns=\"Addresses\").to_excel(f\"{outdir}/wos_addresses.xlsx\", index=False)\n",
"\n",
"affiliations_merge.to_excel(f\"{outdir}/wos_affiliations.xlsx\", index=False)\n",
"\n",
"author_locations.to_excel(f\"{outdir}/wos_author_locations.xlsx\", index=False)\n",
"\n",
"univ_locations.to_excel(f\"{outdir}/wos_univ_locations.xlsx\", index=False)\n",
"mode_final.to_excel(f\"{outdir}/wos_univ_locations_v2.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"kw_df.to_excel(f\"{outdir}/keywords.xlsx\", index=False)\n",
"wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# wos_nlp = wos[[record_col,\"Article Title\",\"Abstract\"]]\n",
"wos = wos.merge(wos_kwd_concat, on=record_col)\n",
"wos[\"Document\"] = wos[\"Article Title\"].str.cat(wos[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ')\n",
"# wos_kwd_test[\"BERT_KWDS\"] = wos_kwd_test[\"Document\"].map(kwd_extract)\n",
"\n",
"vectors = list()\n",
"vector_norms = list()\n",
"\n",
"for doc in nlp.pipe(wos['Document'].astype('unicode').values, batch_size=100,\n",
" n_process=4):\n",
" vectors.append(doc.vector)\n",
" vector_norms.append(doc.vector_norm)\n",
"\n",
"wos['vector'] = vectors\n",
"wos['vector_norm'] = vector_norms\n",
"wos['vector_norm'].plot(kind=\"hist\")\n",
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"% matplotlib inline\n",
"\n",
"vector_data = pd.DataFrame(wos[\"vector\"].to_list(), index=wos[record_col]).reset_index()\n",
"vector_data.head()\n",
"\n",
"labels = vector_data.values[:, 0]\n",
"record_vectors = vector_data.values[:, 1:]\n",
"\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()\n",
"import seaborn as sns\n",
"\n",
"wos_plot = wos.merge(tnse_data, on=record_col)\n",
"\n",
"g = sns.scatterplot(wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification'], x=\"TNSE-X\", y=\"TNSE-Y\",\n",
" hue='Domain_English', s=1)\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
"wos_plot.head()\n",
"wos_nlp = wos_plot[[record_col, \"Document\", \"keyword_all\", \"TNSE-X\", \"TNSE-Y\"]]\n",
"g = sns.kdeplot(\n",
" data=wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification'],\n",
" x=\"TNSE-X\", y=\"TNSE-Y\", hue='Domain_English',\n",
" thresh=.1,\n",
")\n",
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Domain"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"group = 'Domain_English'\n",
"data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=record_col)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sns.barplot(data, x=record_col, y=group)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# group = ['Publication Year','Domain_English']\n",
"# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n",
"# data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# group = ['Publication Year','Domain_English']\n",
"# data = wos.groupby(group)[record_col].nunique().unstack(fill_value=0).stack().reset_index().rename(columns={0:record_col}).sort_values(ascending=False, by=group+[record_col])\n",
"# data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# g=sns.lineplot(data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n",
"# g.set(xticks=list(range(2012,2022+1,2)))\n",
"# g.legend(title=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Field"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# group = ['Publication Year',\"Domain_English\",'Field_English']\n",
"# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n",
"# data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# g = sns.FacetGrid(data, col=\"Domain_English\", col_wrap=3, height=5)\n",
"# g.map_dataframe(sns.lineplot,x=group[0],y=record_col,hue=group[-1])\n",
"# g.set_titles(col_template=\"{col_name}\")\n",
"# g.set(xticks=list(range(2012,2022+1,2)))\n",
"# # g.add_legend()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import matplotlib.pyplot as plt\n",
"# for cat in sorted(data[group[-2]].unique()):\n",
"# sub_data = data[data[group[-2]]==cat]\n",
"# sub_data = sub_data.complete({group[0]:range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)}\n",
"# ,group[-1],fill_value=0)\n",
"# g=sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n",
"# g.set(xticks=list(range(2012,2022+1,2)))\n",
"# g.legend(title=None)\n",
"# g.set_title(cat)\n",
"# g.yaxis.set_major_locator(MaxNLocator(integer=True))\n",
"# plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SubField"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# group = ['Publication Year',\"Domain_English\",'Field_English',\"SubField_English\"]\n",
"# data = wos.groupby(group, as_index=False)[record_col].nunique().sort_values(ascending=False, by=group+[record_col])\n",
"# data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import matplotlib.pyplot as plt\n",
"# for cat in sorted(data[group[-2]].unique()):\n",
"# sub_data = data[data[group[-2]]==cat]\n",
"# sub_data = sub_data.complete({group[0]:range(int(data[group[0]].min()), int(data[group[0]].max()) + 1)}\n",
"# ,group[-1],fill_value=0)\n",
"# g=sns.lineplot(sub_data.sort_values(ascending=True, by=group[-1]),y=record_col,x=group[0], hue=group[-1])\n",
"# g.set(xticks=list(range(2012,2022+1,2)))\n",
"# g.legend(title=None,bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., ncols=math.ceil(len(g.legend_.texts)/12))\n",
"# g.set_title(cat)\n",
"# plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}