def wikinorm(univ_string):
    """Return the web-search result title closest to an institution name.

    The full ``univ_string`` is sent to ``googlesearch.search`` (English,
    3 results, advanced result objects). Results whose title contains
    ``"Category:"`` are discarded, and the remaining titles are ranked by
    edit distance to the part of ``univ_string`` before its first comma.

    NOTE(review): the name suggests Wikipedia titles are expected, but the
    query itself is not restricted to Wikipedia — confirm against callers.

    Parameters
    ----------
    univ_string : str
        Search query; the text before the first comma is treated as the
        institution name used for the distance comparison.

    Returns
    -------
    str
        The title with the smallest edit distance (the earliest one on ties),
        or the institution-name part of ``univ_string`` unchanged when the
        search yields no usable title.
    """
    # Imported at call time, as in the original cell.
    from googlesearch import search
    from nltk.metrics import edit_distance
    from numpy.random import default_rng

    univ_name = univ_string.split(",")[0]
    results = search(
        univ_string,
        lang="en",
        num_results=3,
        advanced=True,
        # Randomised pause drawn from [1, 5); presumably seconds between
        # requests to avoid being rate-limited — verify in googlesearch docs.
        sleep_interval=default_rng().uniform(1, 5),
    )
    titles = [hit.title for hit in results if "Category:" not in hit.title]
    if not titles:
        # Previously this fell through to `[0][0]` on an empty list and raised
        # a bare IndexError; fall back to the un-normalised name instead.
        return univ_name
    # min() returns the first minimal element, matching the stable sort + [0]
    # it replaces, without sorting the whole candidate list.
    return min(titles, key=lambda title: edit_distance(univ_name, title))


def replace_uppercase_words(text):
    """Drop all-uppercase and all-lowercase words from a mixed-case name.

    A name is returned untouched when every whitespace-separated word is
    uppercase, or every word is lowercase (this includes the empty string).
    Otherwise only the words that are neither fully uppercase nor fully
    lowercase are kept — e.g. capitalised words, digits and symbols such as
    ``"&"`` — and they are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Institution name. Non-string values (e.g. ``NaN``/``None`` coming from
        an empty spreadsheet cell) are returned unchanged instead of raising.

    Returns
    -------
    str
        The filtered name, or ``text`` itself when nothing needs filtering or
        when filtering would remove every word.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN / None and have no .split().
        return text

    words = text.split()
    if all(w.isupper() for w in words) or all(w.islower() for w in words):
        return text

    kept = [w for w in words if not (w.isupper() or w.islower())]
    if not kept:
        # e.g. "IBM research": every word would be dropped, leaving "".
        # Keep the original name rather than erasing it.
        return text
    return " ".join(kept)
# Load the institution/location table and add a filtered institution-name column.

# Directory holding the processed input files (relative to the working directory).
outdir="wos_processed_data"
univ = pd.read_excel(f"{outdir}/wos_institution_locations.xlsx")

# NOTE(review): this import sits mid-cell; consider moving it to the notebook's import cell.
from pandarallel import pandarallel
# Must run before any parallel_apply call; requests a progress bar and 4 workers.
pandarallel.initialize(progress_bar=True, nb_workers=4)

# Applies replace_uppercase_words (defined in an earlier cell) to every value of
# "Institution" and stores the result as a new column on the same DataFrame.
univ["Institution_harm"] = univ["Institution"].parallel_apply(replace_uppercase_words)
# Number of rows in the loaded table.
print(len(univ))
\n21043 Vrije Univ Amsterdam \n1938 Univ Essex \n64691 Xian Jiaotong Liverpool Univ \n25740 Chinese Acad Sci \n112682 Dalian Univ Technol \n\n[100 rows x 4 columns]", "text/html": "
\n | UT (Unique WOS ID) | \nInstitution | \nCountry | \nInstitution_harm | \n
---|---|---|---|---|
84810 | \nWOS:000571399800004 | \nUniv Birmingham | \nUnited Kingdom | \nUniv Birmingham | \n
122264 | \nWOS:000732918800001 | \nUniv Southampton | \nUnited Kingdom | \nUniv Southampton | \n
135675 | \nWOS:000799234000004 | \nUCL | \nUnited Kingdom | \nUCL | \n
153134 | \nWOS:000900724501058 | \nKore Univ Enna | \nItaly | \nKore Univ Enna | \n
51445 | \nWOS:000455277600005 | \nUniv Sheffield | \nUnited Kingdom | \nUniv Sheffield | \n
... | \n... | \n... | \n... | \n... | \n
21043 | \nWOS:000372583700005 | \nVrije Univ Amsterdam | \nNetherlands | \nVrije Univ Amsterdam | \n
1938 | \nWOS:000297611600011 | \nUniv Essex | \nUnited Kingdom | \nUniv Essex | \n
64691 | \nWOS:000490430500091 | \nXian Jiaotong Liverpool Univ | \nChina | \nXian Jiaotong Liverpool Univ | \n
25740 | \nWOS:000386793200001 | \nChinese Acad Sci | \nChina | \nChinese Acad Sci | \n
112682 | \nWOS:000696110800001 | \nDalian Univ Technol | \nChina | \nDalian Univ Technol | \n
100 rows × 4 columns
\n\n | Country | \nInstitution_harm | \ncount | \n
---|---|---|---|
7736 | \nFrance | \nYncrea Ouest | \n9 | \n
13752 | \nSpain | \nUniv Carlos | \n1 | \n
15855 | \nUnited Kingdom | \nNorthumbria Univ Newcastle Upon Tyne | \n1 | \n
12514 | \nNorway | \nNord Univ | \n1 | \n
602 | \nChina | \nHenan Polytech Univ | \n87 | \n
... | \n... | \n... | \n... | \n
11620 | \nItaly | \nDeep Blue Srl | \n1 | \n
11183 | \nItaly | \nUniv Giustino Fortunato | \n3 | \n
7433 | \nEstonia | \nPlatinum Software Dev Co | \n1 | \n
5129 | \nChina | \nState & Local Joint Engn Lab Estuarine Hydraul Te | \n1 | \n
6799 | \nChina | \nMOA | \n1 | \n
100 rows × 3 columns
\n\n | UT (Unique WOS ID) | \nInstitution | \nCountry | \nInstitution_harm | \nmerge_iter | \n
---|---|---|---|---|---|
244 | \nWOS:000286472300003 | \nUniv Trent | \nItaly | \nUniv Trento | \n1 | \n
364 | \nWOS:000287586100011 | \nUniv Trent | \nItaly | \nUniv Trento | \n1 | \n
410 | \nWOS:000287939200011 | \nAbdus Salam Int Ctr Theoret Phys | \nItaly | \nAbdus Salaam Int Ctr Theoret Phys | \n1 | \n
765 | \nWOS:000290996200002 | \nUniv Trent | \nItaly | \nUniv Trento | \n1 | \n
907 | \nWOS:000291698400013 | \nINFN Sez Roma 1 | \nItaly | \nSez Roma | \n1 | \n
... | \n... | \n... | \n... | \n... | \n... | \n
153063 | \nWOS:000900129900175 | \nUniv Rome Campus Biomed Aquila | \nItaly | \nUniv Rome Campus Biomed LAquila | \n1 | \n
154775 | \nWOS:000929737300001 | \nPrevent & Res Inst | \nItaly | \nPrevent & Res Inst | \n2 | \n
154813 | \nWOS:000929737300001 | \nIst Super Sanit | \nItaly | \nIst Super Sanita | \n1 | \n
154855 | \nWOS:000933331200004 | \nUniv Federio II | \nItaly | \nUniv Federio | \n2 | \n
154857 | \nWOS:000933331200004 | \nINAF Osservatorio Astron Capodimonte | \nItaly | \nOsserv Astron Capodimonte | \n1 | \n
375 rows × 5 columns
\n