You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ZSI_Reconnect_China/WOS/wos_processing_pipeline.ipynb

1322 lines
367 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"\n",
"def md5hash(s: str):\n",
"    \"\"\"Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8.\"\"\"\n",
"    encoded = s.encode('utf-8')\n",
"    digest = hashlib.md5(encoded)\n",
"    return digest.hexdigest()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Column holding the unique Web of Science record identifier.\n",
"record_col=\"UT (Unique WOS ID)\"\n",
"\n",
"# Tab-separated concatenation of the WoS exports read by the loading cell.\n",
"# The default is a machine-specific absolute path; set the WOS_RECORDS_CSV\n",
"# environment variable to run the notebook elsewhere without editing it.\n",
"outfile = os.environ.get(\n",
"    \"WOS_RECORDS_CSV\",\n",
"    r\"C:\\Users\\radvanyi\\PycharmProjects\\ZSI_analytics\\WOS\\wos_extract\\wos_records_concat.csv\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the concatenated WoS export and keep publication years strictly\n",
"# between 2010 and 2023.\n",
"wos = pd.read_csv(outfile, sep=\"\\t\", low_memory=False)\n",
"wos = wos[(wos[\"Publication Year\"] > 2010) & (wos[\"Publication Year\"] < 2023)].copy()\n",
"print(f'Number of initial (valid interval) records: {len(wos)}')\n",
"\n",
"metrix = pd.read_excel(\"sm_journal_classification.xlsx\", sheet_name=\"Journal_Classification\")\n",
"\n",
"# Reshape the journal classification to one row per ISSN variant. The stacked\n",
"# level is named explicitly instead of renaming the positional 'level_N'\n",
"# default, so the reshape keeps working when the number of columns changes.\n",
"metrix_id_cols = [c for c in metrix.columns if \"issn\" not in c]\n",
"metrix = (\n",
"    metrix.set_index(metrix_id_cols)\n",
"    .rename_axis(columns=\"issn_type\")\n",
"    .stack()\n",
"    .rename(\"issn\")\n",
"    .reset_index()\n",
")\n",
"metrix[\"issn\"] = metrix[\"issn\"].str.replace(\"-\", \"\").str.lower().str.strip()\n",
"\n",
"# Same reshape for the records: one row per (record, normalised ISSN/eISSN).\n",
"# The match on \"issn\" is case-sensitive, so the raw \"ISSN\"/\"eISSN\" columns\n",
"# stay in the index and only the normalised copies are stacked.\n",
"wos[\"issn\"] = wos[\"ISSN\"].str.replace(\"-\", \"\").str.lower().str.strip()\n",
"wos[\"eissn\"] = wos[\"eISSN\"].str.replace(\"-\", \"\").str.lower().str.strip()\n",
"wos_id_cols = [c for c in wos.columns if \"issn\" not in c]\n",
"wos = (\n",
"    wos.set_index(wos_id_cols)\n",
"    .rename_axis(columns=\"issn_var\")\n",
"    .stack()\n",
"    .rename(\"issn\")\n",
"    .reset_index()\n",
")\n",
"\n",
"wos_merge = wos.merge(metrix, on=\"issn\", how=\"left\")\n",
"\n",
"# A record is \"indexed\" when at least one of its ISSN variants matched a\n",
"# classified journal; everything else is kept aside for re-classification.\n",
"wos_indexed = wos_merge[~wos_merge[\"Domain_English\"].isna()]\n",
"wos_unindexed = wos_merge[~wos_merge[record_col].isin(wos_indexed[record_col])]\n",
"\n",
"# Keep a single row per record (the first one after sorting issn_var descending).\n",
"wos_unindexed = wos_unindexed.sort_values(by=[\"issn_var\"], ascending=False).drop_duplicates(subset=record_col)\n",
"wos = wos_indexed.sort_values(by=[\"issn_var\"], ascending=False).drop_duplicates(subset=record_col)\n",
"\n",
"wos_postmerge = wos.copy()\n",
"print(f'Number of METRIX filtered records: {len(wos)}')\n",
"print(f'Number of unindexed records: {len(wos_unindexed)}')\n",
"\n",
"# Drop duplicates based on DOI.\n",
"# NOTE(review): duplicated(keep=False) removes every record that shares a DOI,\n",
"# not only the extra copies - confirm this is intended.\n",
"has_doi = ~wos[\"DOI\"].isna()\n",
"shares_doi = wos[\"DOI\"].duplicated(False)\n",
"wos = wos[~(has_doi & shares_doi)]\n",
"wos = wos.drop_duplicates(subset=[\"Publication Type\",\"Document Type\",\"Authors\",\"Article Title\",\"Source Title\",\"Publication Year\"])\n",
"print(f'Number of filtered records (dropping duplicates): {len(wos)}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos[\"Domain_English\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Fallback classifier: for every (WoS Categories, Research Areas) pair seen\n",
"# among the classified records, keep the first mode of each metrix level.\n",
"category_keys = [\"WoS Categories\", \"Research Areas\"]\n",
"metrix_targets = [\"Domain_English\", \"Field_English\", \"SubField_English\"]\n",
"\n",
"classifier_input = wos[category_keys + list(metrix.columns)].copy().drop_duplicates()\n",
"wos_classifier = (\n",
"    classifier_input\n",
"    .groupby(category_keys, as_index=False)[metrix_targets]\n",
"    .agg(lambda values: values.mode()[0])\n",
")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Re-classify the unindexed records through their WoS category / research\n",
"# area combination; records without a matching combination stay lost.\n",
"metrix_cols = list(metrix.columns)\n",
"wos_to_reindex = wos_unindexed.drop(columns=metrix_cols)\n",
"wos_found = pd.merge(\n",
"    wos_to_reindex,\n",
"    wos_classifier,\n",
"    how=\"inner\",\n",
"    on=[\"WoS Categories\", \"Research Areas\"],\n",
")\n",
"# Alternatives kept for reference: merge on \"Research Areas\" only, or on\n",
"# \"WoS Categories\" only.\n",
"was_recovered = wos_unindexed[record_col].isin(wos_found[record_col])\n",
"wos_stillost = wos_unindexed[~was_recovered]\n",
"\n",
"n_found = wos_found[record_col].nunique()\n",
"n_lost = wos_stillost[record_col].nunique()\n",
"print(\"Found:\", n_found,\"\\nLost forever:\", n_lost)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos = pd.concat([wos,wos_found], ignore_index=True)\n",
"print(f'Number of records (after remerge): {len(wos)}')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos[\"Domain_English\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# One row per (record, WoS category): split the ';'-separated field, explode\n",
"# the lists and drop the helper index level created by the groupby.\n",
"split_categories = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda s: s.str.split(';'))\n",
"wos_cat = split_categories.explode().reset_index()\n",
"wos_cat = wos_cat.drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n",
"wos_cat[\"WoS Categories\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Split each category at its first comma into a main category and a\n",
"# sub-category, then count every main category once per record.\n",
"wos_subcat = wos_cat.copy()\n",
"category_parts = wos_subcat[\"WoS Categories\"].str.split(\",\", n=1, expand=True)\n",
"wos_subcat[['WoS Category', 'WoS SubCategory']] = category_parts\n",
"for text_col in ('WoS Category', 'WoS SubCategory', \"WoS Categories\"):\n",
"    stripped = wos_subcat[text_col].str.strip()\n",
"    wos_subcat[text_col] = stripped\n",
"\n",
"one_per_record = wos_subcat.drop_duplicates(subset=[record_col, 'WoS Category'])\n",
"one_per_record[\"WoS Category\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# One row per (record, research area), built the same way as wos_cat.\n",
"split_areas = wos.groupby(record_col)[\"Research Areas\"].apply(lambda s: s.str.split(';'))\n",
"wos_areas = split_areas.explode().reset_index()\n",
"wos_areas = wos_areas.drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos[[\"Article Title\",\"Keywords Plus\",\"Author Keywords\"]].sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Collect \"Keywords Plus\" and \"Author Keywords\" into one long table holding\n",
"# one upper-cased keyword per row.\n",
"keyword_frames = []\n",
"for c in [\"Keywords Plus\", \"Author Keywords\"]:\n",
"    kwp = wos.groupby(record_col)[c].apply(lambda x: x.str.split(';')).explode().str.strip().str.upper()\n",
"    kwp.name = 'keyword_all'\n",
"    keyword_frames.append(kwp.reset_index())\n",
"# Concatenate once; later sources go first, as in the previous row order.\n",
"kw_df = pd.concat(keyword_frames[::-1], ignore_index=True)\n",
"kw_df = kw_df.dropna(subset=[\"keyword_all\"]).drop(columns=\"level_1\")\n",
"\n",
"# Remove bracketed annotations such as \"(...)\" or \"[...]\". The pattern is a raw\n",
"# string (the non-raw form relied on invalid escape sequences) and the result\n",
"# is stripped again because removing a trailing \"(...)\" leaves whitespace.\n",
"kw_df[\"keyword_all\"] = (\n",
"    kw_df[\"keyword_all\"]\n",
"    .str.replace(r\"[\\(\\[].*?[\\)\\]]\", \"\", regex=True)\n",
"    .str.strip()\n",
")\n",
"# Drop keywords that became empty and duplicates created by the clean-up.\n",
"kw_df = kw_df[kw_df[\"keyword_all\"] != \"\"].drop_duplicates()\n",
"kw_df.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos_kwd_concat = kw_df.groupby(record_col, as_index=False).agg({'keyword_all': '; '.join})\n",
"wos_kwd_concat.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"geotext = GeoText()\n",
"\n",
"# Address fragments that identify a country when GeoText finds none, mapped to\n",
"# the country name to report. Built once instead of on every call.\n",
"_COUNTRY_ANOMALIES = {\n",
"    \"Malta\": \"Malta\",\n",
"    \"Mongolia\": \"Mongolia\",\n",
"    \"Quatar\": \"Qatar\",\n",
"    \"Qatar\": \"Qatar\",\n",
"    \"Ethiop\": \"Ethiopia\",\n",
"    \"Nigeria\": \"Nigeria\",\n",
"    \"BELAR\": \"Belarus\",\n",
"    \"Venezuela\": \"Venezuela\",\n",
"    \"Cyprus\": \"Cyprus\",\n",
"    \"Ecuador\": \"Ecuador\",\n",
"    \"U Arab\": \"United Arab Emirates\",\n",
"    \"Syria\": \"Syria\",\n",
"    \"Uganda\": \"Uganda\",\n",
"    \"Yemen\": \"Yemen\",\n",
"    \"Mali\": \"Mali\",\n",
"    \"Senegal\": \"Senegal\",\n",
"    \"Vatican\": \"Vatican\",\n",
"    \"Uruguay\": \"Uruguay\",\n",
"    \"Panama\": \"Panama\",\n",
"    \"Fiji\": \"Fiji\",\n",
"    \"Faroe\": \"Faroe Islands\",\n",
"    \"Macedonia\": \"Macedonia\",\n",
"    \"Mozambique\": \"Mozambique\",\n",
"    \"Kuwait\": \"Kuwait\",\n",
"    \"Libya\": \"Libya\",\n",
"    \"Turkiy\": \"Turkey\",\n",
"    \"Liberia\": \"Liberia\",\n",
"    \"Namibia\": \"Namibia\",\n",
"    \"Ivoire\": \"Ivory Coast\",\n",
"    \"Guatemala\": \"Guatemala\",  # was misspelt \"Gutemala\"\n",
"    \"Paraguay\": \"Paraguay\",\n",
"    \"Honduras\": \"Honduras\",\n",
"    \"Nicaragua\": \"Nicaragua\",\n",
"    \"Trinidad\": \"Trinidad & Tobago\",\n",
"    \"Liechtenstein\": \"Liechtenstein\",\n",
"    \"Greenland\": \"Denmark\",\n",
"}\n",
"\n",
"# Constituent parts of the United Kingdom as they are looked up in addresses.\n",
"_UK_PARTS = ['Scotland', 'Wales', 'England', 'N Ireland']\n",
"\n",
"\n",
"def extract_location(input_text, key='countries'):\n",
"    \"\"\"Return one location name found in ``input_text``, or ``None``.\n",
"\n",
"    ``key`` selects the entry of the GeoText extraction result to read\n",
"    ('countries' by default; the notebook also calls it with 'cities').\n",
"    When several names are found, the alphabetically first one is returned.\n",
"    For ``key='countries'`` only, substring fallbacks are tried when GeoText\n",
"    finds nothing: the UK constituent parts first, then the anomaly table.\n",
"    \"\"\"\n",
"    extracted = geotext.extract(input_text=input_text)\n",
"    found = sorted(extracted[key].keys())\n",
"    if found:\n",
"        return found[0]\n",
"    if key == 'countries':\n",
"        if any(part in input_text for part in _UK_PARTS):\n",
"            return 'United Kingdom'\n",
"        for fragment, country in _COUNTRY_ANOMALIES.items():\n",
"            if fragment in input_text:\n",
"                return country\n",
"    return None\n",
"\n",
"\n",
"# EU member names are read from the first line of a comma-separated file.\n",
"with open('../eu_members.txt',\"r\") as f:\n",
"    eu_countries = [name.strip() for name in f.readline().split(\",\")]\n",
"\n",
"\n",
"def country_cleanup(country):\n",
"    \"\"\"Normalise a raw country string taken from the end of an address.\n",
"\n",
"    Anything containing \"USA\" or \"China\" collapses to that name, the UK\n",
"    constituent parts map to \"United Kingdom\", and every other value is\n",
"    returned unchanged.\n",
"    \"\"\"\n",
"    if \"USA\" in country:\n",
"        return \"USA\"\n",
"    if \"China\" in country:\n",
"        return \"China\"\n",
"    if country in [\"England\", \"Northern Ireland\", \"Wales\", \"Scotland\",\"N Ireland\"]:\n",
"        return \"United Kingdom\"\n",
"    return country\n",
"\n",
"\n",
"def country_type(country):\n",
"    \"\"\"Classify a cleaned country name as EU, China, Non-EU associate or Other.\"\"\"\n",
"    if country in eu_countries:\n",
"        return \"EU\"\n",
"    if country==\"China\":\n",
"        return \"China\"\n",
"    if country in [\"Switzerland\", 'Norway','United Kingdom']:\n",
"        return \"Non-EU associate\"\n",
"    return \"Other\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split the \"Addresses\" field at every '[' so that each piece looks like\n",
"# \"author; author] address; address\", one row per piece.\n",
"locations = wos.groupby(record_col)[\"Addresses\"].apply(lambda x: x.str.split('[')).explode().reset_index().drop(columns=\"level_1\")\n",
"\n",
"# Drop empty pieces and records without any address: a missing value is not a\n",
"# string, so it would make the .split calls below fail.\n",
"has_address = locations[\"Addresses\"].notna() & (locations[\"Addresses\"] != \"\")\n",
"locations = locations[has_address].copy()\n",
"\n",
"# Text after the last ']' is the address part, text before the first ']' the\n",
"# author list of that address.\n",
"locations[\"Address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[-1])\n",
"locations[\"Authors_of_address\"] = locations[\"Addresses\"].apply(lambda x:x.split(\"]\")[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"len(locations)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Trim surrounding whitespace and ';' from the address part, then split it at\n",
"# ';' so every address of an author group gets its own row.\n",
"trimmed_address = locations[\"Address\"].str.strip()\n",
"locations[\"Address\"] = trimmed_address.str.strip(\";\")\n",
"\n",
"address_groups = locations.groupby([record_col, \"Authors_of_address\"])[\"Address\"]\n",
"address_lists = address_groups.apply(lambda s: s.str.split(';'))\n",
"locations = address_lists.explode().reset_index()\n",
"locations = locations.drop(columns=\"level_2\")\n",
"locations.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# import dask.dataframe as dd\n",
"#\n",
"# locations_ddf = dd.from_pandas(locations, npartitions=4) # convert pandas DataFrame to Dask DataFrame\n",
"# loc_compute = locations_ddf.groupby([record_col,\"Authors_of_address\"])[\"Address\"].apply(lambda x: x.str.split(';')).explode().compute() # compute the result"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# locations_test = locations.head(1000)\n",
"# locations_test = locations_test.groupby([record_col,\"Authors_of_address\"])[\"Address\"].str.split(';').explode()\n",
"# locations_test"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"\n",
"def _last_address_field(address):\n",
"    \"\"\"Return the text after the last comma, trimmed of spaces and ';'.\"\"\"\n",
"    return address.split(\",\")[-1].strip(\" \").strip(\";\").strip(\" \")\n",
"\n",
"\n",
"# The country is taken from the last comma-separated field of the address\n",
"# (extract_location with key='countries' is the alternative that is not used).\n",
"raw_country = locations['Address'].apply(_last_address_field)\n",
"locations[\"Country\"] = raw_country\n",
"locations[\"Country\"] = locations['Country'].apply(country_cleanup)\n",
"locations[\"City\"] = locations['Address'].apply(lambda address: extract_location(input_text=address, key='cities'))\n",
"locations[\"Country_Type\"] = locations[\"Country\"].apply(country_type)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Keep only addresses whose country type is part of the study scope.\n",
"scope_types = [\"EU\",\"China\",\"Non-EU associate\"]\n",
"in_scope = locations[\"Country_Type\"].isin(scope_types)\n",
"locations = locations[in_scope]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Institution table: the institution is the first comma-separated field of\n",
"# the address; exact duplicate rows are removed.\n",
"institution_cols = [record_col, \"Address\", \"Country\", \"City\", \"Country_Type\"]\n",
"univ_locations = locations[institution_cols].copy()\n",
"first_field = univ_locations[\"Address\"].apply(lambda address: address.split(\",\")[0])\n",
"univ_locations[\"Institution\"] = first_field\n",
"univ_locations = univ_locations.drop_duplicates()\n",
"univ_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def _author_id(raw_name):\n",
"    \"\"\"Hash an author name: lower-case, keep alphanumerics only, then MD5.\"\"\"\n",
"    normalised = ''.join(ch for ch in raw_name.lower() if ch.isalnum())\n",
"    return md5hash(normalised)\n",
"\n",
"\n",
"# One row per (record, country, author): split the ';'-separated author list\n",
"# of every address and replace the name by its hashed identifier.\n",
"author_groups = locations.groupby([record_col, \"Country\", \"Country_Type\"])[\"Authors_of_address\"]\n",
"author_lists = author_groups.apply(lambda s: s.str.split(';'))\n",
"author_locations = author_lists.explode().reset_index().drop(columns=\"level_3\")\n",
"\n",
"author_names = author_locations[\"Authors_of_address\"].str.strip()\n",
"author_locations = author_locations.drop(columns=\"Authors_of_address\")\n",
"author_locations[\"author_str_id\"] = author_names.apply(_author_id)\n",
"author_locations.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"author_locations[author_locations['author_str_id'].duplicated(False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep one affiliation per (record, author). Rows are sorted by Country_Type\n",
"# first, so the alphabetically first type (\"China\" before \"EU\" before\n",
"# \"Non-EU associate\") is the one that is kept.\n",
"by_country_type = author_locations.sort_values(by=\"Country_Type\")\n",
"author_primary_region = by_country_type.drop_duplicates(subset=[record_col, \"author_str_id\"])\n",
"\n",
"primary_type = author_primary_region[\"Country_Type\"]\n",
"china = author_primary_region[primary_type == \"China\"][record_col].unique()\n",
"eu = author_primary_region[primary_type == \"EU\"][record_col].unique()\n",
"assoc = author_primary_region[primary_type == \"Non-EU associate\"][record_col].unique()\n",
"\n",
"# records that have distinct authors with different country affiliations:\n",
"# at least one China author and at least one EU or associated-country author\n",
"has_china_author = wos[record_col].isin(china)\n",
"has_european_author = wos[record_col].isin(eu) | wos[record_col].isin(assoc)\n",
"valid_scope = wos[has_china_author & has_european_author][record_col].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"author_primary_region.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f'Number of records: {len(wos)}')\n",
"print(f'Number of valid cooperation records: {len(valid_scope)}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Restrict every derived table to the records that are valid cooperations.\n",
"wos = wos[wos[record_col].isin(valid_scope)]\n",
"locations = locations[locations[record_col].isin(valid_scope)]\n",
"univ_locations = univ_locations[univ_locations[record_col].isin(valid_scope)]\n",
"author_locations = author_locations[author_locations[record_col].isin(valid_scope)]\n",
"# Filter author_primary_region itself; it used to be overwritten with the\n",
"# filtered author_locations, which discarded the one-row-per-author selection.\n",
"author_primary_region = author_primary_region[author_primary_region[record_col].isin(valid_scope)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One row per (record, affiliation); names are trimmed and upper-cased, and\n",
"# records without affiliation are labelled \"UNKNOWN\".\n",
"affiliation_lists = wos.groupby(record_col)[\"Affiliations\"].apply(lambda s: s.str.split(';'))\n",
"affiliations = affiliation_lists.explode().reset_index().drop(columns=\"level_1\")\n",
"cleaned_names = affiliations[\"Affiliations\"].str.strip().str.upper()\n",
"affiliations[\"Affiliations\"] = cleaned_names.fillna(\"UNKNOWN\")\n",
"affiliations = affiliations.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"affiliations[\"Affiliations\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"univ_locations[\"Institution\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"univ_locations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"affiliations[record_col].nunique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"univ_locations[\"Institution\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"affiliations[\"Affiliations\"].value_counts().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rebuild the per-record category table for the records left after the scope\n",
"# filter. Categories are stripped, as in the first wos_cat cell, so that\n",
"# \" X\" and \"X\" are not counted as different categories.\n",
"# NOTE(review): wos_subcat (exported to wos_categories.csv) is not rebuilt\n",
"# after the scope filter in the visible notebook - confirm that is intended.\n",
"wos_cat = wos.groupby(record_col)[\"WoS Categories\"].apply(lambda x: x.str.split(';')).explode().reset_index().drop(columns=\"level_1\")\n",
"wos_cat[\"WoS Categories\"] = wos_cat[\"WoS Categories\"].str.strip()\n",
"wos_cat[\"WoS Categories\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rebuild the per-record research-area table for the records left after the\n",
"# scope filter.\n",
"area_lists = wos.groupby(record_col)[\"Research Areas\"].apply(lambda s: s.str.split(';'))\n",
"wos_areas = area_lists.explode().reset_index()\n",
"wos_areas = wos_areas.drop(columns=\"level_1\")\n",
"wos_areas[\"Research Areas\"] = wos_areas[\"Research Areas\"].str.strip()\n",
"wos_areas[\"Research Areas\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[c for c in wos.columns if \"_English\" in c]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# In every \"*_English\" classification column, relabel the\n",
"# \"article-level classification\" placeholder as \"Multidisciplinary\".\n",
"metrix_levels = [c for c in wos.columns if \"_English\" in c]\n",
"for level_col in metrix_levels:\n",
"    relabelled = wos[level_col].replace(\"article-level classification\", \"Multidisciplinary\")\n",
"    wos[level_col] = relabelled\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wos"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"metrix_levels"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# De-duplicated lookup tables used for the exports and the network edges.\n",
"country_cols = [record_col, \"Country\"]\n",
"author_cols = [record_col, \"author_str_id\", \"Country\"]\n",
"institution_cols = [record_col, \"Institution\", \"Country\"]\n",
"type_cols = [\"Country\", \"Country_Type\"]\n",
"\n",
"record_countries = locations[country_cols].drop_duplicates()\n",
"record_author_locations = author_locations[author_cols].drop_duplicates()\n",
"record_institution = univ_locations[institution_cols].drop_duplicates()\n",
"country_types = locations[type_cols].drop_duplicates()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Basic network layout"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Country co-occurrence edges: pair every country of a record with every\n",
"# other country of the same record. Each pair appears in both directions,\n",
"# and every row gets weight 0.5.\n",
"country_pairs = pd.merge(record_countries, record_countries, on=record_col)\n",
"differs = country_pairs[\"Country_x\"] != country_pairs[\"Country_y\"]\n",
"country_collabs = country_pairs[differs].assign(weight=0.5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Institution co-occurrence edges, built like the country edges: both\n",
"# directions of a pair are present and every row gets weight 0.5.\n",
"institution_pairs = pd.merge(record_institution, record_institution, on=record_col)\n",
"differs = institution_pairs[\"Institution_x\"] != institution_pairs[\"Institution_y\"]\n",
"inst_collabs = institution_pairs[differs].assign(weight=0.5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Columns removed before export: any column whose name contains one of these\n",
"# fragments, except the keyword columns (\"Author Keywords\" contains \"uthor\").\n",
"name_fragments = (\"uthor\", \"ddress\", \"ORCID\", \"esearcher\", \"ditor\", \"name\", 'SEQ')\n",
"drop_cols = [\n",
"    col for col in wos.columns\n",
"    if \"eyword\" not in col and any(fragment in col for fragment in name_fragments)\n",
"]\n",
"drop_cols"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"outdir=\"wos_processed_data\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"os.makedirs(outdir, exist_ok=True)\n",
"\n",
"# Excel exports, written in the listed order without the index column.\n",
"excel_exports = [\n",
"    (f\"{outdir}/wos_processed.xlsx\", wos.drop(columns=drop_cols)),\n",
"    (f\"{outdir}/wos_countries.xlsx\", record_countries),\n",
"    (f\"{outdir}/wos_author_locations.xlsx\", record_author_locations),\n",
"    (f\"{outdir}/wos_institution_locations.xlsx\", record_institution),\n",
"    (f\"{outdir}/wos_keywords.xlsx\", kw_df),\n",
"    (f\"{outdir}/wos_country_types.xlsx\", country_types),\n",
"]\n",
"for target_path, frame in excel_exports:\n",
"    frame.to_excel(target_path, index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Tab-separated exports, written in the listed order without the index column.\n",
"csv_exports = [\n",
"    (f\"{outdir}/wos_processed.csv\", wos.drop(columns=drop_cols)),\n",
"    (f\"{outdir}/wos_countries.csv\", record_countries),\n",
"    (f\"{outdir}/wos_author_locations.csv\", record_author_locations),\n",
"    (f\"{outdir}/wos_institution_locations.csv\", record_institution),\n",
"    (f\"{outdir}/wos_keywords.csv\", kw_df),\n",
"    (f\"{outdir}/wos_country_types.csv\", country_types),\n",
"    (f\"{outdir}/wos_inst_collabs.csv\", inst_collabs),\n",
"    (f\"{outdir}/wos_country_collabs.csv\", country_collabs),\n",
"]\n",
"for target_path, frame in csv_exports:\n",
"    frame.to_csv(target_path, index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos_areas.to_csv(f\"{outdir}/wos_research_areas.csv\", index=False, sep='\\t')\n",
"\n",
"wos_subcat.to_csv(f\"{outdir}/wos_categories.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Simple NLP part"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"from flashgeotext.geotext import GeoText\n",
"import re"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load('en_core_web_trf')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"outdir=\"wos_processed_data\"\n",
"record_col=\"UT (Unique WOS ID)\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600013 COMPARATIVE GENOMICS\n1 WOS:000208863600013 ANAMMOX\n2 WOS:000208863600013 KUENENIA STUTTGARTIENSIS\n3 WOS:000208863600013 METAGENOMICS\n4 WOS:000208863600013 ENRICHMENT CULTURE\n.. ... ...\n95 WOS:000209672000007 SECURITY\n96 WOS:000209672000007 TRUST EVALUATION\n97 WOS:000209672000007 WIRELESS SENSOR NETWORK \n98 WOS:000209673200006 FORMAL VERIFICATION\n99 WOS:000209673200006 STOCHASTIC MODEL CHECKING\n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>COMPARATIVE GENOMICS</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>ANAMMOX</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>KUENENIA STUTTGARTIENSIS</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>METAGENOMICS</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>ENRICHMENT CULTURE</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>95</th>\n <td>WOS:000209672000007</td>\n <td>SECURITY</td>\n </tr>\n <tr>\n <th>96</th>\n <td>WOS:000209672000007</td>\n <td>TRUST EVALUATION</td>\n </tr>\n <tr>\n <th>97</th>\n <td>WOS:000209672000007</td>\n <td>WIRELESS SENSOR NETWORK</td>\n </tr>\n <tr>\n <th>98</th>\n <td>WOS:000209673200006</td>\n <td>FORMAL VERIFICATION</td>\n </tr>\n <tr>\n <th>99</th>\n <td>WOS:000209673200006</td>\n <td>STOCHASTIC MODEL CHECKING</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kw_df.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [],
"source": [
"# Reload the exported tables so the NLP part can run from the files on disk.\n",
"wos = pd.read_excel(f\"{outdir}/wos_processed.xlsx\")\n",
"kw_df = pd.read_excel(f\"{outdir}/wos_keywords.xlsx\")\n",
"kw_df = kw_df.dropna(subset=[\"keyword_all\"]).copy()\n",
"\n",
"# One row per record holding all of its keywords joined with '; '.\n",
"keywords_by_record = kw_df.groupby(record_col, as_index=False)\n",
"wos_kwd_concat = keywords_by_record.agg({'keyword_all': '; '.join})"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 43,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) keyword_all\n0 WOS:000208863600013 COMPARATIVE GENOMICS\n1 WOS:000208863600013 ANAMMOX\n2 WOS:000208863600013 KUENENIA STUTTGARTIENSIS\n3 WOS:000208863600013 METAGENOMICS\n4 WOS:000208863600013 ENRICHMENT CULTURE",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>WOS:000208863600013</td>\n <td>COMPARATIVE GENOMICS</td>\n </tr>\n <tr>\n <th>1</th>\n <td>WOS:000208863600013</td>\n <td>ANAMMOX</td>\n </tr>\n <tr>\n <th>2</th>\n <td>WOS:000208863600013</td>\n <td>KUENENIA STUTTGARTIENSIS</td>\n </tr>\n <tr>\n <th>3</th>\n <td>WOS:000208863600013</td>\n <td>METAGENOMICS</td>\n </tr>\n <tr>\n <th>4</th>\n <td>WOS:000208863600013</td>\n <td>ENRICHMENT CULTURE</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kw_df.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 45,
"outputs": [],
"source": [
"# Keyword \"documents\": every distinct keyword becomes its own document with a\n",
"# synthetic id \"kw_<row index>\".\n",
"kwd_nlp = pd.DataFrame(kw_df[\"keyword_all\"].drop_duplicates())\n",
"kwd_nlp = kwd_nlp.rename(columns={\"keyword_all\":\"Document\"})\n",
"kwd_nlp[\"Type\"] = \"kw\"\n",
"kwd_nlp[record_col] = \"kw_\"+(kwd_nlp.index).astype(str)\n",
"\n",
"# Record documents: the upper-cased, '; '-joined keyword string of a record.\n",
"wos_nlp = wos.merge(wos_kwd_concat, on=record_col)\n",
"wos_nlp[\"Document\"] = wos_nlp[\"keyword_all\"].fillna(\"\").str.upper()\n",
"# wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].str.cat(wos_nlp[[\"Abstract\", \"keyword_all\"]].fillna(\"\"), sep=' - ').str.upper()\n",
"# wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].str.cat(wos_nlp[[\"Abstract\"]].fillna(\"\"), sep=' - ').str.upper()\n",
"# The result of drop_duplicates() used to be discarded; assign it so that\n",
"# repeated (record, document) rows are really removed. All columns are kept\n",
"# because \"keyword_all\" is still selected below.\n",
"wos_nlp = wos_nlp.drop_duplicates(subset=[record_col, \"Document\"])\n",
"wos_nlp[\"Type\"] = \"doc\"\n",
"\n",
"tnse_nlp = pd.concat([kwd_nlp,wos_nlp], ignore_index=True)\n",
"tnse_nlp = tnse_nlp[[record_col,\"Type\",\"Document\",\"keyword_all\"]]\n",
"# tnse_nlp = tnse_nlp.sample(1000)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 47,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Type Document keyword_all\n66311 kw_132167 kw VERTICAL PROGRAMMABILITY NaN\n121641 kw_354170 kw NONLINEAR CLUSTER INVERSION NaN\n35468 kw_59369 kw TIME-VARIANT PARAMETER NaN\n117421 kw_324755 kw MULTI-INCIDENCE NaN\n87947 kw_199369 kw EVERGREEN BROADLEAVED TREES NaN\n... ... ... ... ...\n56273 kw_105016 kw DOUBLE ARC COORDINATE PLOT NaN\n26548 kw_42376 kw MODAL SHIFT NaN\n70903 kw_144947 kw PRIVACY-PERSEVERANCE NaN\n49655 kw_88641 kw IRAP NaN\n104544 kw_254913 kw COGNITIVE-PROCESSES NaN\n\n[100 rows x 4 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Type</th>\n <th>Document</th>\n <th>keyword_all</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>66311</th>\n <td>kw_132167</td>\n <td>kw</td>\n <td>VERTICAL PROGRAMMABILITY</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>121641</th>\n <td>kw_354170</td>\n <td>kw</td>\n <td>NONLINEAR CLUSTER INVERSION</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>35468</th>\n <td>kw_59369</td>\n <td>kw</td>\n <td>TIME-VARIANT PARAMETER</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>117421</th>\n <td>kw_324755</td>\n <td>kw</td>\n <td>MULTI-INCIDENCE</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>87947</th>\n <td>kw_199369</td>\n <td>kw</td>\n <td>EVERGREEN BROADLEAVED TREES</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>56273</th>\n <td>kw_105016</td>\n <td>kw</td>\n <td>DOUBLE ARC COORDINATE PLOT</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>26548</th>\n <td>kw_42376</td>\n <td>kw</td>\n <td>MODAL SHIFT</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>70903</th>\n <td>kw_144947</td>\n <td>kw</td>\n <td>PRIVACY-PERSEVERANCE</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>49655</th>\n <td>kw_88641</td>\n <td>kw</td>\n <td>IRAP</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>104544</th>\n <td>kw_254913</td>\n <td>kw</td>\n <td>COGNITIVE-PROCESSES</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 4 columns</p>\n</div>"
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tnse_nlp.sample(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Embed every document: average the last tensor of the transformer output\n",
"# over axis 0 and scale the result to unit length. The norm of the scaled\n",
"# vector is stored as well and plotted as a histogram.\n",
"documents = tnse_nlp['Document'].astype('unicode').values\n",
"\n",
"vectors = []\n",
"vector_norms = []\n",
"for doc in nlp.pipe(documents, batch_size=300, n_process=4):\n",
"    pooled = doc._.trf_data.tensors[-1].mean(axis=0)\n",
"    unit_vector = pooled / np.linalg.norm(pooled)\n",
"    vectors.append(unit_vector)\n",
"    vector_norms.append(np.linalg.norm(unit_vector))\n",
"\n",
"tnse_nlp['vector'] = vectors\n",
"tnse_nlp['vector_norm'] = vector_norms\n",
"tnse_nlp['vector_norm'].plot(kind=\"hist\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
}
},
{
"cell_type": "code",
"execution_count": 32,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) Type \n159915 WOS:000493345400001 doc \\\n62232 kw_120676 kw \n18729 kw_28349 kw \n146728 WOS:000337736000001 doc \n157327 WOS:000793790600002 doc \n... ... ... \n64501 kw_126785 kw \n114208 kw_304857 kw \n90681 kw_207619 kw \n117081 kw_322648 kw \n146051 WOS:000660876800002 doc \n\n Document \n159915 A COOPERATIVE EFFECT-BASED DECISION SUPPORT MO... \\\n62232 URBAN STREET VITALITY \n18729 CONTINUOUS ATTRIBUTE DISCRETISATION \n146728 VENTRICULAR FIBRILLATION AND TACHYCARDIA CLASS... \n157327 MAPPING AND MODELLING DEFECT DATA FROM UAV CAP... \n... ... \n64501 LITTER PRODUCTION \n114208 MIXING-STATE \n90681 SAR-OPTICAL \n117081 INNATE IMMUNE-RESPONSE \n146051 LOW-CYCLE FATIGUE LIFETIME ESTIMATION AND PRED... \n\n keyword_all \n159915 TEAM FORMATION; COOPERATIVE EFFECT; COVERING; ... \\\n62232 NaN \n18729 NaN \n146728 MACHINE LEARNING; PUBLIC DOMAIN ELECTROCARDIOG... \n157327 UNMANNED AERIAL VEHICLE ; BUILDING INFORMATION... \n... ... \n64501 NaN \n114208 NaN \n90681 NaN \n117081 NaN \n146051 GAS TURBINE; LCF; COMPRESSOR; PREDICTIVE MAINT... \n\n vector vector_norm \n159915 [0.037737507, 0.03163352, -0.023620829, -0.019... 1.0 \n62232 [0.05269539, -0.00761333, -0.043163303, -0.023... 1.0 \n18729 [0.048983343, -0.012124105, -0.0497743, -0.024... 1.0 \n146728 [0.041310925, 0.03034619, -0.020368228, -0.021... 1.0 \n157327 [0.04185079, 0.03162047, -0.022166232, -0.0242... 1.0 \n... ... ... \n64501 [0.04933314, 0.0028764526, -0.053359915, -0.03... 1.0 \n114208 [0.04587132, -0.014809725, -0.037412226, -0.02... 1.0 \n90681 [0.049859583, 0.00093559147, -0.040774263, -0.... 1.0 \n117081 [0.04046586, -0.009001592, -0.043696642, -0.02... 1.0 \n146051 [0.038426127, 0.032835256, -0.015592382, -0.02... 1.0 \n\n[100 rows x 6 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>Type</th>\n <th>Document</th>\n <th>keyword_all</th>\n <th>vector</th>\n <th>vector_norm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>159915</th>\n <td>WOS:000493345400001</td>\n <td>doc</td>\n <td>A COOPERATIVE EFFECT-BASED DECISION SUPPORT MO...</td>\n <td>TEAM FORMATION; COOPERATIVE EFFECT; COVERING; ...</td>\n <td>[0.037737507, 0.03163352, -0.023620829, -0.019...</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>62232</th>\n <td>kw_120676</td>\n <td>kw</td>\n <td>URBAN STREET VITALITY</td>\n <td>NaN</td>\n <td>[0.05269539, -0.00761333, -0.043163303, -0.023...</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>18729</th>\n <td>kw_28349</td>\n <td>kw</td>\n <td>CONTINUOUS ATTRIBUTE DISCRETISATION</td>\n <td>NaN</td>\n <td>[0.048983343, -0.012124105, -0.0497743, -0.024...</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>146728</th>\n <td>WOS:000337736000001</td>\n <td>doc</td>\n <td>VENTRICULAR FIBRILLATION AND TACHYCARDIA CLASS...</td>\n <td>MACHINE LEARNING; PUBLIC DOMAIN ELECTROCARDIOG...</td>\n <td>[0.041310925, 0.03034619, -0.020368228, -0.021...</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>157327</th>\n <td>WOS:000793790600002</td>\n <td>doc</td>\n <td>MAPPING AND MODELLING DEFECT DATA FROM UAV CAP...</td>\n <td>UNMANNED AERIAL VEHICLE ; BUILDING INFORMATION...</td>\n <td>[0.04185079, 0.03162047, -0.022166232, -0.0242...</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>64501</th>\n <td>kw_126785</td>\n <td>kw</td>\n <td>LITTER PRODUCTION</td>\n <td>NaN</td>\n <td>[0.04933314, 0.0028764526, -0.053359915, -0.03...</td>\n 
<td>1.0</td>\n </tr>\n <tr>\n <th>114208</th>\n <td>kw_304857</td>\n <td>kw</td>\n <td>MIXING-STATE</td>\n <td>NaN</td>\n <td>[0.04587132, -0.014809725, -0.037412226, -0.02...</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>90681</th>\n <td>kw_207619</td>\n <td>kw</td>\n <td>SAR-OPTICAL</td>\n <td>NaN</td>\n <td>[0.049859583, 0.00093559147, -0.040774263, -0....</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>117081</th>\n <td>kw_322648</td>\n <td>kw</td>\n <td>INNATE IMMUNE-RESPONSE</td>\n <td>NaN</td>\n <td>[0.04046586, -0.009001592, -0.043696642, -0.02...</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>146051</th>\n <td>WOS:000660876800002</td>\n <td>doc</td>\n <td>LOW-CYCLE FATIGUE LIFETIME ESTIMATION AND PRED...</td>\n <td>GAS TURBINE; LCF; COMPRESSOR; PREDICTIVE MAINT...</td>\n <td>[0.038426127, 0.032835256, -0.015592382, -0.02...</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 6 columns</p>\n</div>"
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tnse_nlp.head(100)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 41,
"outputs": [
{
"data": {
"text/plain": " UT (Unique WOS ID) TNSE-X TNSE-Y\n0 kw_0 127.197891 114.109520\n1 kw_1 -21.558281 -202.681183\n2 kw_2 15.277477 -37.555573\n3 kw_3 54.094421 -164.205536\n4 kw_4 -165.029221 -96.129143",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>UT (Unique WOS ID)</th>\n <th>TNSE-X</th>\n <th>TNSE-Y</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>kw_0</td>\n <td>127.197891</td>\n <td>114.109520</td>\n </tr>\n <tr>\n <th>1</th>\n <td>kw_1</td>\n <td>-21.558281</td>\n <td>-202.681183</td>\n </tr>\n <tr>\n <th>2</th>\n <td>kw_2</td>\n <td>15.277477</td>\n <td>-37.555573</td>\n </tr>\n <tr>\n <th>3</th>\n <td>kw_3</td>\n <td>54.094421</td>\n <td>-164.205536</td>\n </tr>\n <tr>\n <th>4</th>\n <td>kw_4</td>\n <td>-165.029221</td>\n <td>-96.129143</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Project the embeddings stored in tnse_nlp[\"vector\"] down to two dimensions with t-SNE.\n",
"# One row per record: column 0 carries the record id, the remaining columns the vector components.\n",
"vector_data = pd.DataFrame(tnse_nlp[\"vector\"].to_list(), index=tnse_nlp[record_col]).reset_index()\n",
"\n",
"# Slice by position with .iloc so the feature matrix keeps its numeric dtype; slicing `.values`\n",
"# of the mixed id/float frame produced an object-dtype array instead.\n",
"labels = vector_data.iloc[:, 0].to_numpy()\n",
"record_vectors = vector_data.iloc[:, 1:].to_numpy()\n",
"\n",
"# random_state fixes the seed used by t-SNE.\n",
"# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to `max_iter` - check the installed\n",
"# version before upgrading the environment.\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"\n",
"# Re-attach the record ids to the 2-D coordinates.\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [
{
"data": {
"text/plain": "<matplotlib.legend.Legend at 0x1c5cd3dbd90>"
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqEAAAGwCAYAAACZwLz9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd5hcZfnw8e9pc6aX7X032eymNwKhQ2jSQYpi15+9Yu+vXVHE3hAVFUVFRQHpvYb0XjbZzfa+Ozu9nf7+MTEQQUCEEOR8rivXlZ1zduaZZ8+evecp9y04juPgcrlcLpfL5XIdQuJL3QCXy+VyuVwu1yuPG4S6XC6Xy+VyuQ45Nwh1uVwul8vlch1ybhDqcrlcLpfL5Trk3CDU5XK5XC6Xy3XIuUGoy+VyuVwul+uQc4NQl8vlcrlcLtchJ7/UDXC5XC6Xy+V6ObIsC8MwXupmHFYURUGSpOd0rhuEulwul8vlcv0HHMdhfHycVCqFW/LnYIIA0WiU+vp6BEF4xnPdIPRZzMxk3QuM8kVVWRly++MQcPv60HH7+tBx+/rQcPv5iT54MY2Pj5NMpgiFoqiqCjxzsPXK4aBpGslkCoCGhoZnPNsNQp+F4/CK/UV+Om5/HDpuXx86bl8fOm5fHxpuP794LMsilSoHoKFQ5KVuzmHH4/ECkEqlqK2tfcapeXdjksvlcrlcLtdzZBgGjsP+EVDX01FVFcfhWdfLukGoy+VyuVwu13/MnYL/955b37hBqMvlcrlcLpfrkHODUJfL5XK5XC7XIecGoS6Xy+VyuVyvAGNjYxxzzBGMjY291E0B3N3xLpfL5XK5XC8Jy3bYMpJiOqdTHfSwvCmKJL5y1pq6QajL5XK5XC7XIXb/3mmuun8fk1ntwGO1IZVPnjaH0+ZWv4QtO3Tc6XiXy+VyuVyuQ+j+vdN88uZdBwWgAFNZjU/evIv7904fknb85S83cOqpJ3LssSt47LFHDjx+6aUX8tWvfvHA11df/WO+9KXPv+Cv7wahLpfL5XK5XIeIZTtcdf8+nq6WwD8fu+r+fVj2i1tt4IEH7uPqq3/M9773Q0499XQ2b94EwPT0NKOjI2zfvvXAuevXr+OYY457wdvgBqEul8vlcrlch8iWkdRTRkCfzAEmsxpbRlIvWhu2bdvM17/+Zb7+9W+xbNkRrFx5DJs3bwRg69bNrFx5NOPjE8zMzJDJZOju7naDUJfL5XK5XK6Xs+mc/oKe93x861vfQNN0amvrADjmmGPp6ekml8uyZctmVq48ljlz5rBt21Y2btxAR0cHsVjsBW+HG4S6XC6Xy+VyHSLVQc8Let7z8d73foCTT17Fd77zLQBqa+toampm69YtbN26maVLl7F06XK2b9/Kxo3rX5RRUHCDUJfL5XK5XK5DZnlTlNqQ+m8LWwqUd8kvb4q+aG04+eRT+dCHPkJXVxd33HEbUB4Nffjhh5iYGGfevHksW1YOQtetW+MGoS6X64X1vYd6ueK+vS91M1wul+sVRRIFPnnaHOCpFdb/+fUnT5vzoucLra9v4E1vegs/+ckPyeWyHH30sdx11+3Mn78AWVZYunQ5e/Z0kU6nWLRo8YvSBjcIdblegUqmTVa32DNZ4JYdI/xq3dBL3SSXy+V6xThtbjVXvXohNSH1oMdrQipXvXrhIcsT+uY3vw2PR+Gaa67miCNWIAgCS5cuB6CyspLGxiaOOupoZPnFSSsvOI7z4uYAeJmLx7O4PQSCAFVVIbc/DoEXq69N2+br9/ZwzoJqrrhnH8fNjpEsmAQ8IoLjcOnyJuZWB1+4F3wZcK/rQ8ft60PD7ecn+uDFUiqV6O3to6qqDo9HffZveAb/qxWTdF0jHp+gvX02Xq/3357nVkxyuV4hREFgQW2QeFZnTnUQ03KI53TSkojlOPzssUG+e+F8ZPG5T5D8cfMoIa/E+QvqXsSWu1wu1/8mSRQ4suWF33X+cuEGoS7XK8TvNo
6wayLLaKrICbNiPNafoinqxe8RmckbNMfU5xyAPtw7w4rGEAOJAj6PxBfv2oNpO6xojuA4ApcuqX+R343L5XK5Xu7cINTleoVojnjJlHSSBYNk0aA6qCCIMJQsUelXaI76n/b7Lv/7Tt54ZBNHt0QByGkmP3tsgOqgh7BPJlk00C0b3bTJlix2T2TpnspxdGuU0zqq2TWR5dfrhvn4KbN4uH+G+qCXWRUBWmO+Q/juXS6Xy3W4cYNQl+sV4rTOao5ujTGR1fjZYwM0RX3smchxYnsFvfECIVV62u9717EttFU8ETCuG0oxvy6ITxEYTZeTKUe8MqOlEoOJAjnNQhAE9kxmERCYKWpMZEr86NEBprIaqizhEeGHF784uy1dLpfL9fLgBqEu1ytIUJWZo8p879ULMUybnG4S83vomsxRG3r6xMiL68MHfT2R1agOelg7kKIl4kW3HWQJfIrE1P7F9WPpEkXdIqjKTGQN2ir9pIsG9SGVqazGCZ2HZueny+VyuQ5fbhDqcr1CKbJITC4HnvNrn3lX/Edv3oUqi/gUiYJuYjoOPkVEd2zGMhqLg2EEdDyyhIBDSJUJ+xV+t26EC5fWM5TQaanw0xrzURdWOfJFTML8Qto4nEKRBJY2RF7qprhcLtf/HDdPqMv1P8SyHdIl4wV/Xo8kMJIqEfCKTOd1BMHBsBwCHglRcLh79ySVIS+KJLBuMIUiA47N0bNj9M/kWFAX5u0rm3nd8kYUGT5zWxeZ4otXF/k/oRsWM4Un+ixRNHigJ8764ST/2DXB7zeNcM3jAwd9z6dv7eJzd+w5xC19YYhmAX/PjVTdeDaBrt+/1M1xuVyvYG4Q6nL9D3m4b4ZP/2P3Ux4fTpV4yx+28Pnb93Dzzgl2j6fYM5l5ynmWbfO+G7fzk8f6AcjrFm/4/SbesKKJqqCHnqkCAhD1KngVkeGERkd1kKXNEUaTBXyyyPLmCCXDYThZIlk0KBg2Z82vpjpYzqf36L4kyYJB0bJf1L54rq64cw8funHHga+v2zDEr9cN8uu1w/RMFzBMm4xmknxS0FwwLbIvQrD/YlNmdlJx25vwDz8EgWp8o6upuvtdKMmul7ppLpfrFcidjne5/oec2FbBwroQmmnzqVt3s7gxzCWL6lg3mMABAqqARxK5Zt0IybzB7CofS+rCbBzN8OUzO+mN51AliYCnvEkp4JH48MntdFb5mVcTQBAFNg2nGM/o5WA0ICMJApphYzmQKZkYjkO2ZDKryo9hWEzkDNorAwfa+NGTZ2M5Dn7lpbv9XHFfDyXTYm5NgA+sauekWdEDxwYTJUJehYBHojbioVA00UyLz96+h/qgh8uOaOC7FyzgK/d08+0HepgpmJw1r5pT5lThOA62w2GbbNrfezP4KiDRiyCrOEYRwSwSWf0VSm2nkVv0rpe6iS6X6xXEDUJdrpcpBwcBga/f28NEtoQiiRzbGuWRvgRvX9lMRdDDLVvHyGoGu8ZyzKsN8o6jm/nbjgk0w6KjJsBQoohpZ5jOanz4pl1ohoUgwJuPbOLmneNsHUnzpTPnMpIq8fhAkld1VtIQ9OBXZWzHZiihM6qV8KsSsiTQGFXpm9FojvrYPpLhHce2MFM4eNpdlZ9+F/5/4qad4zSGvax8HkmeMyWNRFEnW7Lonc7TGPOzfTTDzx4d4IcXLwIcqoIeCrpFMmdQF/GSKhqICKRKFt+8t5d59UEG4wUao16GkkWs/aVpPnzzTgQEvv/qhYiCwOr+BEc2R16Q9/xCUEbWgTcKvkocbxjy0zjhFnAMxOzkS908l8t1iP3ylz9n8+ZNXH31L1+S13en412ul6HueI7zfrGex/oTnLuwBkkQkEUBnyKhmTZd0zl2jGR4zZGNdI3nsR2HkCpyzdpBHu2dweuRkUWBiF+moJl01gTxKgLLmkLMrvIzmCjy3Qd6Gc1oaGZ593tIlZBFkSNbYmwYSqOZAkGviKKIRLwyOAIDCY2YX6azOsAf37
yCc+bV8uYjmgHQTAvT/u/qCKZKBltH06zuS7BhOH3QsR8+2s++ePZZn+Mr9+zDMG0kAdoq/RzfUUOuZFATUjEti4X
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Join the t-SNE coordinates onto the source rows and colour the points by the Type column.\n",
"wos_plot = tnse_nlp.merge(tnse_data, on=record_col)\n",
"\n",
"# Pass the frame via data= so the call does not depend on seaborn accepting it positionally.\n",
"g = sns.scatterplot(data=wos_plot, x=\"TNSE-X\", y=\"TNSE-Y\", hue='Type', s=1)\n",
"# Place the legend outside the axes; the trailing ';' keeps the Legend repr out of the cell output.\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Attach the concatenated keywords to the WoS records and build one text per record of the form\n",
"# '<title> - <abstract> - <keywords>'; missing abstracts/keywords are replaced by empty strings.\n",
"# NOTE(review): relies on `nlp` already being loaded by an earlier cell - confirm on a fresh kernel.\n",
"wos_nlp = wos.merge(wos_kwd_concat, on=record_col)\n",
"wos_nlp[\"Document\"] = wos_nlp[\"Article Title\"].str.cat(\n",
"    wos_nlp[[\"Abstract\", \"keyword_all\"]].fillna(\"\"),\n",
"    sep=' - ',\n",
")\n",
"\n",
"# Run the texts through the spaCy pipeline, keeping each doc's vector and vector norm.\n",
"vectors = []\n",
"vector_norms = []\n",
"document_texts = wos_nlp['Document'].astype('unicode').values\n",
"for parsed in nlp.pipe(document_texts, batch_size=300, n_process=4):\n",
"    vectors.append(parsed.vector)\n",
"    vector_norms.append(parsed.vector_norm)\n",
"\n",
"wos_nlp['vector'] = vectors\n",
"wos_nlp['vector_norm'] = vector_norms\n",
"# Histogram of the vector norms.\n",
"wos_nlp['vector_norm'].plot.hist()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Project the document embeddings stored in wos_nlp[\"vector\"] down to two dimensions with t-SNE.\n",
"# One row per record: column 0 carries the record id, the remaining columns the vector components.\n",
"vector_data = pd.DataFrame(wos_nlp[\"vector\"].to_list(), index=wos_nlp[record_col]).reset_index()\n",
"\n",
"# Slice by position with .iloc so the feature matrix keeps its numeric dtype; slicing `.values`\n",
"# of the mixed id/float frame produced an object-dtype array instead.\n",
"labels = vector_data.iloc[:, 0].to_numpy()\n",
"record_vectors = vector_data.iloc[:, 1:].to_numpy()\n",
"\n",
"# random_state fixes the seed used by t-SNE.\n",
"# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to `max_iter` - check the installed\n",
"# version before upgrading the environment.\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"\n",
"# Re-attach the record ids to the 2-D coordinates.\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Join the t-SNE coordinates back onto the WoS records.\n",
"wos_plot = wos_nlp.merge(tnse_data, on=record_col)\n",
"\n",
"# Colour by domain, leaving out rows labelled 'article-level classification'.\n",
"domain_plot_data = wos_plot[wos_plot[\"Domain_English\"] != 'article-level classification']\n",
"g = sns.scatterplot(data=domain_plot_data, x=\"TNSE-X\", y=\"TNSE-Y\", hue='Domain_English', s=1)\n",
"g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
"\n",
"# Reduce wos_nlp to the id, text and coordinate columns.\n",
"# NOTE(review): this rebinds wos_nlp, so re-running this cell on its own fails because\n",
"# Domain_English is no longer among its columns - consider a separate name for the reduced frame.\n",
"wos_nlp = wos_plot[[record_col, \"Document\", \"keyword_all\", \"TNSE-X\", \"TNSE-Y\"]]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"\n",
"wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos_nlp.to_csv(f\"{outdir}/wos_nlp.csv\", index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"wos_nlp.columns"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import spacy\n",
"\n",
"# Load the en_core_web_lg pipeline and embed each distinct value of kw_df[\"keyword_all\"].\n",
"nlp = spacy.load(\"en_core_web_lg\")\n",
"kwd_nlp = pd.DataFrame(kw_df[\"keyword_all\"].drop_duplicates())\n",
"\n",
"# Run the keyword strings through the pipeline, keeping each doc's vector and vector norm.\n",
"vectors = []\n",
"vector_norms = []\n",
"keyword_texts = kwd_nlp['keyword_all'].astype('unicode').values\n",
"for parsed in nlp.pipe(keyword_texts, batch_size=300, n_process=4):\n",
"    vectors.append(parsed.vector)\n",
"    vector_norms.append(parsed.vector_norm)\n",
"\n",
"kwd_nlp['vector'] = vectors\n",
"kwd_nlp['vector_norm'] = vector_norms\n",
"# Histogram of the vector norms.\n",
"kwd_nlp['vector_norm'].plot.hist()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.manifold import TSNE\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Project the keyword embeddings stored in kwd_nlp[\"vector\"] down to two dimensions with t-SNE.\n",
"# One row per keyword: column 0 carries the keyword string, the remaining columns the vector components.\n",
"vector_data = pd.DataFrame(kwd_nlp[\"vector\"].to_list(), index=kwd_nlp[\"keyword_all\"]).reset_index()\n",
"\n",
"# Slice by position with .iloc so the feature matrix keeps its numeric dtype; slicing `.values`\n",
"# of the mixed string/float frame produced an object-dtype array instead.\n",
"labels = vector_data.iloc[:, 0].to_numpy()\n",
"record_vectors = vector_data.iloc[:, 1:].to_numpy()\n",
"\n",
"# random_state fixes the seed used by t-SNE.\n",
"# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to `max_iter` - check the installed\n",
"# version before upgrading the environment.\n",
"tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42, metric='cosine')\n",
"tnse_2d = tsne_model.fit_transform(record_vectors)\n",
"\n",
"# Re-attach the keyword strings to the 2-D coordinates.\n",
"# NOTE(review): the keyword column is labelled with record_col even though it holds keyword text;\n",
"# the name is kept because the export cell de-duplicates on record_col.\n",
"tnse_data = pd.DataFrame(tnse_2d, index=labels).reset_index()\n",
"tnse_data.columns = [record_col, \"TNSE-X\", \"TNSE-Y\"]\n",
"tnse_data.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Scatter of the 2-D coordinates in tnse_data. No hue is mapped, so there are no labelled artists\n",
"# for a legend to list; the former g.legend(...) call is therefore dropped.\n",
"g = sns.scatterplot(data=tnse_data, x=\"TNSE-X\", y=\"TNSE-Y\", s=1)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Persist both result tables as tab-separated CSV files and as Excel workbooks under outdir.\n",
"wos_nlp.to_csv(f\"{outdir}/wos_nlp.csv\", sep='\\t', index=False)\n",
"tnse_data.to_csv(f\"{outdir}/kw_nlp.csv\", sep='\\t', index=False)\n",
"\n",
"wos_nlp.to_excel(f\"{outdir}/wos_nlp.xlsx\", index=False)\n",
"# The workbook copy keeps only the first row per value of the record_col column.\n",
"kw_unique = tnse_data.drop_duplicates(subset=record_col)\n",
"kw_unique.to_excel(f\"{outdir}/kw_nlp.xlsx\", index=False)"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}