{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [],
   "source": [
    "# Libraries and modules used throughout the notebook\n",
    "\n",
    "import pandas as pd\n",
    "import re\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import numpy as np\n",
    "from scipy.sparse import csr_matrix\n",
    "import sparse_dot_topn.sparse_dot_topn as ct  # cosine similarity\n",
    "import time\n",
    "from tqdm import tqdm"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "def wikinorm(univ_string):\n",
    "    \"\"\"Return the search-result title closest to an institution name.\n",
    "\n",
    "    The whole ``univ_string`` is passed to ``googlesearch.search``; the text\n",
    "    before its first comma is then compared, by edit distance, with every\n",
    "    returned title that does not contain 'Category:'.\n",
    "\n",
    "    Args:\n",
    "        univ_string: Institution description used as the search query.\n",
    "\n",
    "    Returns:\n",
    "        The title with the smallest edit distance (the first one on ties),\n",
    "        or ``None`` when the search returns no usable title.\n",
    "    \"\"\"\n",
    "    # Imported here rather than at the top, so these packages are only\n",
    "    # required when wikinorm is actually called.\n",
    "    from googlesearch import search\n",
    "    from nltk.metrics import edit_distance\n",
    "    from numpy.random import default_rng\n",
    "\n",
    "    rng = default_rng()\n",
    "    # Random 1-5 s interval between requests (presumably to avoid throttling).\n",
    "    results = search(univ_string, lang=\"en\", num_results=3, advanced=True,\n",
    "                     sleep_interval=rng.uniform(1, 5))\n",
    "    univ_name = univ_string.split(\",\")[0]\n",
    "    titles = [hit.title for hit in results if \"Category:\" not in hit.title]\n",
    "    if not titles:\n",
    "        # No usable title: return None instead of indexing an empty list.\n",
    "        return None\n",
    "    # min() keeps the first of equally distant titles, like a stable sort.\n",
    "    return min(titles, key=lambda title: edit_distance(univ_name, title))\n"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "def replace_uppercase_words(text):\n",
    "    \"\"\"Drop single-case words from a mixed-case institution name.\n",
    "\n",
    "    A word counts as single-case when ``str.isupper()`` or ``str.islower()``\n",
    "    is true for it, e.g. 'KTH Royal Inst Technol' -> 'Royal Inst Technol'.\n",
    "    Despite the function name, nothing is replaced and fully lower-case\n",
    "    words are dropped as well.\n",
    "\n",
    "    Args:\n",
    "        text: Institution name. Non-string values (such as the NaN pandas\n",
    "            uses for empty cells) are returned unchanged.\n",
    "\n",
    "    Returns:\n",
    "        ``text`` itself when all of its words are upper-case, all are\n",
    "        lower-case, or none would survive the filtering; otherwise the\n",
    "        remaining words joined by single spaces.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        # Missing cells arrive as float NaN / None and have no .split().\n",
    "        return text\n",
    "    words = text.split()\n",
    "    if all(w.isupper() for w in words) or all(w.islower() for w in words):\n",
    "        return text\n",
    "    kept = [w for w in words if not (w.isupper() or w.islower())]\n",
    "    # A name such as 'CNR italy' has no mixed-case word; keep it as it is\n",
    "    # rather than collapsing it to an empty string.\n",
    "    return \" \".join(kept) if kept else text"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO: Pandarallel will run on 4 workers.\n",
      "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process 
and workers.\n",
      "\n",
      "WARNING: You are on Windows. If you detect any issue with pandarallel, be sure you checked out the Troubleshooting page:\n",
      "https://nalepae.github.io/pandarallel/troubleshooting/\n"
     ]
    },
    {
     "data": {
      "text/plain": "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44660), Label(value='0 / 44660')))…",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "92c1cd6c14644ffeb042b38f5d5d98c5"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "178638\n"
     ]
    }
   ],
   "source": [
    "from pandarallel import pandarallel\n",
    "\n",
    "# Folder holding the processed Web of Science tables.\n",
    "outdir = \"wos_processed_data\"\n",
    "univ = pd.read_excel(f\"{outdir}/wos_institution_locations.xlsx\")\n",
    "\n",
    "# Start the pandarallel workers that back parallel_apply below.\n",
    "pandarallel.initialize(progress_bar=True, nb_workers=4)\n",
    "\n",
    "# Add the harmonised institution name next to the raw one.\n",
    "univ[\"Institution_harm\"] = univ[\"Institution\"].parallel_apply(replace_uppercase_words)\n",
    "print(univ.shape[0])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": " UT (Unique WOS ID) Institution \n153271 WOS:000784587900008 Univ Pisa \\\n159800 WOS:000810042500002 China Japan Friendship Hosp \n130931 WOS:000691922800007 Karl Franzens Univ Graz \n1500 WOS:000292944600012 CNR \n113964 WOS:000618210000032 Karolinska Univ Hosp \n... ... ... 
\n160284 Austria Univ Appl Sci Upper Austria \n29314 United Kingdom Univ Southampton \n17045 Czech Republic Charles Univ Prague \n164118 China Nanjing Univ Aeronaut & Astronaut \n109992 Sweden Royal Inst Technol \n\n[100 rows x 4 columns]", "text/html": "
\n | UT (Unique WOS ID) | \nInstitution | \nCountry | \nInstitution_harm | \n
---|---|---|---|---|
153271 | \nWOS:000784587900008 | \nUniv Pisa | \nItaly | \nUniv Pisa | \n
159800 | \nWOS:000810042500002 | \nChina Japan Friendship Hosp | \nChina | \nChina Japan Friendship Hosp | \n
130931 | \nWOS:000691922800007 | \nKarl Franzens Univ Graz | \nAustria | \nKarl Franzens Univ Graz | \n
1500 | \nWOS:000292944600012 | \nCNR | \nItaly | \nCNR | \n
113964 | \nWOS:000618210000032 | \nKarolinska Univ Hosp | \nSweden | \nKarolinska Univ Hosp | \n
... | \n... | \n... | \n... | \n... | \n
160284 | \nWOS:000812227000009 | \nUniv Appl Sci Upper Austria | \nAustria | \nUniv Appl Sci Upper Austria | \n
29314 | \nWOS:000381396400013 | \nUniv Southampton | \nUnited Kingdom | \nUniv Southampton | \n
17045 | \nWOS:000347046200017 | \nCharles Univ Prague | \nCzech Republic | \nCharles Univ Prague | \n
164118 | \nWOS:000832954200001 | \nNanjing Univ Aeronaut & Astronaut | \nChina | \nNanjing Univ Aeronaut & Astronaut | \n
109992 | \nWOS:000604257500070 | \nKTH Royal Inst Technol | \nSweden | \nRoyal Inst Technol | \n
100 rows × 4 columns
\n\n | Country | \nInstitution_harm | \ncount | \n
---|---|---|---|
8168 | \nCroatia | \nInst Adriat Crops & Karst Reclamat | \n1 | \n
3417 | \nChina | \nCtr Eye & Vis Res | \n1 | \n
1034 | \nChina | \nWestlake Inst Adv Study | \n13 | \n
13427 | \nItaly | \nMacerata Hosp | \n1 | \n
8071 | \nChina | \nKey Lab Ecoind Green Technol Fujian Prov | \n1 | \n
... | \n... | \n... | \n... | \n
17230 | \nUnited Kingdom | \nUniv Kingston | \n6 | \n
8847 | \nFrance | \nUniv Artois | \n8 | \n
16071 | \nSpain | \nCatalonia Geriatr & Gerontol Soc | \n1 | \n
6357 | \nChina | \nWuxi Huace Elect Syst Co Ltd | \n1 | \n
9049 | \nFrance | \nExcelia Business Sch | \n3 | \n
100 rows × 3 columns
\n\n | UT (Unique WOS ID) | \nInstitution | \nCountry | \nInstitution_harm | \nmerge_iter | \n
---|---|---|---|---|---|
49282 | \nWOS:000428099700011 | \nUniv Sheffield | \nUnited Kingdom | \nUniv Sheffield | \n0 | \n
51975 | \nWOS:000432981300002 | \nChinese Acad Sci | \nChina | \nChinese Acad Sci | \n0 | \n
64618 | \nWOS:000459693000011 | \nBabes Bolyai Univ | \nRomania | \nBabes Bolyai Univ | \n0 | \n
163145 | \nWOS:000828102100001 | \nXidian Univ | \nChina | \nXidian Univ | \n0 | \n
99690 | \nWOS:000566510600001 | \nFora Forest Technol | \nSpain | \nFora Forest Technol | \n0 | \n
... | \n... | \n... | \n... | \n... | \n... | \n
1567 | \nWOS:000293492500004 | \nUniv Essex | \nUnited Kingdom | \nUniv Essex | \n0 | \n
73076 | \nWOS:000476471800022 | \nShanghai Univ | \nChina | \nShanghai Univ | \n0 | \n
137096 | \nWOS:000715426400001 | \nQueen Mary Hosp | \nChina | \nQueen Mary Hosp | \n0 | \n
164978 | \nWOS:000836819000003 | \nManchester Metropolitan Univ | \nUnited Kingdom | \nManchester Metropolitan Univ | \n0 | \n
32973 | \nWOS:000390181300013 | \nUniv Complutense Madrid | \nSpain | \nUniv Complutense Madrid | \n0 | \n
500 rows × 5 columns
\n