From 024dd8de8c1dc0d05cbe3407b49f8b804630421c Mon Sep 17 00:00:00 2001 From: Kriti Mahajan Date: Mon, 29 Jun 2020 21:55:40 +0530 Subject: [PATCH 1/3] Add files via upload --- .../notebooks/Snorkel RE example.ipynb | 2187 +++++++++++++++++ 1 file changed, 2187 insertions(+) create mode 100644 immunology_kg/notebooks/Snorkel RE example.ipynb diff --git a/immunology_kg/notebooks/Snorkel RE example.ipynb b/immunology_kg/notebooks/Snorkel RE example.ipynb new file mode 100644 index 0000000..d44c4a9 --- /dev/null +++ b/immunology_kg/notebooks/Snorkel RE example.ipynb @@ -0,0 +1,2187 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import snorkel\n", + "\n", + "from snorkel.preprocess import preprocessor\n", + "from snorkel.preprocess.nlp import SpacyPreprocessor\n", + "from snorkel.types import DataPoint\n", + "\n", + "from snorkel.labeling.lf.nlp import nlp_labeling_function\n", + "from snorkel.labeling import PandasLFApplier,filter_unlabeled_dataframe,LFAnalysis ,labeling_function\n", + "from snorkel.labeling.model import MajorityClassVoter,MajorityLabelVoter,RandomVoter ,LabelModel\n", + "\n", + "from snorkel.analysis import metric_score , get_label_buckets\n", + "\n", + "from snorkel.utils import probs_to_preds\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import pandas as pd\n", + "import re\n", + "import os\n", + "from collections import OrderedDict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Load the data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0textsourcerelationtargetlinkpmc_iddoi_id
00While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...negativeCorrelation{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
11Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...negativeCorrelation{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
22Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...negativeCorrelation{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
33Based on these results, we performed virtual d...{\"4'-epidoxorubicin\": {'namespace': 'chebi', '...decreases{'3.4.22.69': {'namespace': 'eccode', 'name': ...{'annotations': {}, 'citation': {'authors': ['...32173287.0NaN
44Doctors can also use a clinically approved bil...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...{'annotations': {'mesh': {'D008168': True}}, '...32205856.0NaN
55Since Vitamin B3 is highly lung protective, it...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...{'annotations': {}, 'citation': {'authors': ['...32205856.0NaN
66Doctors can also use a clinically approved bil...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'inflammatory response': {'namespace': 'go', ...{'annotations': {'mesh': {'D008168': True}}, '...32205856.0NaN
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 text \\\n", + "0 0 While blocking TPC2 activity by tetrandrine, a... \n", + "1 1 Chemoinformatics searches yielded 15 approved ... \n", + "2 2 Thyroid stimulating hormone and free triiodoth... \n", + "3 3 Based on these results, we performed virtual d... \n", + "4 4 Doctors can also use a clinically approved bil... \n", + "5 5 Since Vitamin B3 is highly lung protective, it... \n", + "6 6 Doctors can also use a clinically approved bil... \n", + "\n", + " source relation \\\n", + "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... negativeCorrelation \n", + "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... negativeCorrelation \n", + "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... negativeCorrelation \n", + "3 {\"4'-epidoxorubicin\": {'namespace': 'chebi', '... decreases \n", + "4 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", + "5 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", + "6 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", + "\n", + " target \\\n", + "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", + "1 {'hypertension': {'namespace': 'doid', 'name':... \n", + "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "3 {'3.4.22.69': {'namespace': 'eccode', 'name': ... \n", + "4 {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',... \n", + "5 {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',... \n", + "6 {'inflammatory response': {'namespace': 'go', ... \n", + "\n", + " link pmc_id \\\n", + "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", + "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", + "3 {'annotations': {}, 'citation': {'authors': ['... 32173287.0 \n", + "4 {'annotations': {'mesh': {'D008168': True}}, '... 32205856.0 \n", + "5 {'annotations': {}, 'citation': {'authors': ['... 32205856.0 \n", + "6 {'annotations': {'mesh': {'D008168': True}}, '... 
32205856.0 \n", + "\n", + " doi_id \n", + "0 NaN \n", + "1 https://doi.org/10.1101/2020.03.22.002386 \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 NaN " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#'https://raw.githubusercontent.com/covid19kg/covid19kg/master/supplement/terminology.csv'\n", + "url = 'https://raw.githubusercontent.com/CoronaWhy/task-vt/kaleidoescape_kg/immunology_kg/relations/covid19_frauenhofer_annotations.csv'\n", + "pybel_pd = pd.read_csv(url)\n", + "pybel_pd.head(7)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['negativeCorrelation', 'decreases', 'regulates', 'increases',\n", + " 'positiveCorrelation', 'association', 'isA', 'biomarkerFor',\n", + " 'prognosticBiomarkerFor', 'causesNoChange'], dtype=object)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#list all types of relations\n", + "relation_categories = pybel_pd['relation'].unique()\n", + "relation_categories" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Snorkel Example\n", + "\n", + "For the purpose of this example we'll only focus on rows with 'negativeCorrelation' and 'positiveCorrelation' as their relations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " \n", + "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:4102: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " errors=errors,\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsourcerelationtargetlinkpmc_iddoi_id
0While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...True{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
1Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...True{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
2Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
3The administration of methylprednisolone appea...{'6-methylprednisolone': {'namespace': 'chebi'...True{'Death': {'namespace': 'mesh', 'name': 'Death...{'annotations': {'doid': {'11394': True}}, 'ci...32167524.0NaN
4Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'Low-grade fever': {'namespace': 'hp', 'name'...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 While blocking TPC2 activity by tetrandrine, a... \n", + "1 Chemoinformatics searches yielded 15 approved ... \n", + "2 Thyroid stimulating hormone and free triiodoth... \n", + "3 The administration of methylprednisolone appea... \n", + "4 Adverse reactions of IFN-α mainly include low-... \n", + "\n", + " source relation \\\n", + "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... True \n", + "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... True \n", + "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... True \n", + "3 {'6-methylprednisolone': {'namespace': 'chebi'... True \n", + "4 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", + "\n", + " target \\\n", + "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", + "1 {'hypertension': {'namespace': 'doid', 'name':... \n", + "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "3 {'Death': {'namespace': 'mesh', 'name': 'Death... \n", + "4 {'Low-grade fever': {'namespace': 'hp', 'name'... \n", + "\n", + " link pmc_id \\\n", + "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", + "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", + "3 {'annotations': {'doid': {'11394': True}}, 'ci... 32167524.0 \n", + "4 {'annotations': {}, 'citation': {'authors': ['... 
32166483.0 \n", + "\n", + " doi_id \n", + "0 NaN \n", + "1 https://doi.org/10.1101/2020.03.22.002386 \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_data = pybel_pd[(pybel_pd['relation']=='negativeCorrelation') | (pybel_pd['relation']=='positiveCorrelation') ]\n", + "example_data['relation'] = example_data['relation']=='negativeCorrelation'\n", + "example_data.reset_index(inplace=True,drop=True)\n", + "example_data.drop('Unnamed: 0',inplace=True,axis=1)\n", + "example_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Split the data into training and testing \n", + "\n", + "Ideally there should be training, validation and testing sets. Also, here, I'm using a fixed test split, but k-fold cross-validation techniques are a more robust way of determining the accuracy of the generated labels." + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [], + "source": [ + "df_train,df_test,y_train,y_test = train_test_split(example_data[['text']],example_data[['relation']],test_size=0.20,shuffle=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Reading sentences to understand syntactic differences between negative and positive correlation sentences\n", + "\n", + "The utility of snorkel is that it allows you to create multiple labelling functions which (try to) mimic the rules that a human annotator of data would apply while deciding how to label unlabelled data. For instance, a human annotator looking to identify negative correlation in sentences will follow these rules:\n", + "\n", + "##### A. 
Positive rules: rules which tell you what a sentence with negative correlation looks like\n", + "\n", + "1) does the sentence contain reduction-related words like 'decreased','reduced','lowered'\n", + "\n", + "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'\n", + "\n", + "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)\n", + "\n", + "4) does the sentence contain the expression 'negative effect'\n", + "\n", + "5) does the sentence contain the expression 'move in opposite directions'\n", + "\n", + "##### B. Negative rules: rules which tell you what a sentence without negative correlation looks like\n", + "\n", + "1) does the sentence contain increase-related words like 'increased','improved'\n", + "\n", + "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related'\n", + "\n", + "3) does the syntax follow the structure 'increase in x is associated with an increase in y' (and vice versa)\n", + "\n", + "4) does the sentence contain the expression 'positive effect'\n", + "\n", + "5) does the sentence contain the expression 'move in the same direction'\n", + "\n", + "These rules can be coded using snorkel. Importantly, it requires both positive and negative rules. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2.1 Examining Negative correlation sentences to understand their syntactic structure and then define labelling functions accordingly" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsourcerelationtargetlinkpmc_iddoi_id
0While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...True{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
1Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...True{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
2Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
3The administration of methylprednisolone appea...{'6-methylprednisolone': {'namespace': 'chebi'...True{'Death': {'namespace': 'mesh', 'name': 'Death...{'annotations': {'doid': {'11394': True}}, 'ci...32167524.0NaN
4In our opinion, during the COVID-19 pandemic, ...{'adrenergic antagonist': {'namespace': 'chebi...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32220710.0NaN
5Consistent with previous reports, 20mM NH4Cl a...{'ammonium chloride': {'namespace': 'chebi', '...True{'G protein, vesicular stomatitis virus': {'na...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
6If the latter percentage would be found to be ...{'angiotensin receptor antagonist': {'namespac...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32129518.0NaN
7Consistent with previous reports, 20mM NH4Cl a...{'bafilomycin A1': {'namespace': 'chebi', 'nam...True{'G protein, vesicular stomatitis virus': {'na...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 While blocking TPC2 activity by tetrandrine, a... \n", + "1 Chemoinformatics searches yielded 15 approved ... \n", + "2 Thyroid stimulating hormone and free triiodoth... \n", + "3 The administration of methylprednisolone appea... \n", + "4 In our opinion, during the COVID-19 pandemic, ... \n", + "5 Consistent with previous reports, 20mM NH4Cl a... \n", + "6 If the latter percentage would be found to be ... \n", + "7 Consistent with previous reports, 20mM NH4Cl a... \n", + "\n", + " source relation \\\n", + "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... True \n", + "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... True \n", + "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... True \n", + "3 {'6-methylprednisolone': {'namespace': 'chebi'... True \n", + "4 {'adrenergic antagonist': {'namespace': 'chebi... True \n", + "5 {'ammonium chloride': {'namespace': 'chebi', '... True \n", + "6 {'angiotensin receptor antagonist': {'namespac... True \n", + "7 {'bafilomycin A1': {'namespace': 'chebi', 'nam... True \n", + "\n", + " target \\\n", + "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", + "1 {'hypertension': {'namespace': 'doid', 'name':... \n", + "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "3 {'Death': {'namespace': 'mesh', 'name': 'Death... \n", + "4 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "5 {'G protein, vesicular stomatitis virus': {'na... \n", + "6 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "7 {'G protein, vesicular stomatitis virus': {'na... \n", + "\n", + " link pmc_id \\\n", + "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", + "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", + "3 {'annotations': {'doid': {'11394': True}}, 'ci... 32167524.0 \n", + "4 {'annotations': {}, 'citation': {'authors': ['... 
32220710.0 \n", + "5 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "6 {'annotations': {}, 'citation': {'authors': ['... 32129518.0 \n", + "7 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "\n", + " doi_id \n", + "0 NaN \n", + "1 https://doi.org/10.1101/2020.03.22.002386 \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 NaN \n", + "7 NaN " + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "neg_correl_df = example_data[example_data['relation']==1]\n", + "neg_correl_df.reset_index(inplace=True,drop=True)\n", + "neg_correl_df.head(8)" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry.'" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "neg_correl_df['text'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2.2 Examining positive correlation sentences to understand their syntactic structure and then define labelling functions accordingly" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsourcerelationtargetlinkpmc_iddoi_id
0Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'Low-grade fever': {'namespace': 'hp', 'name'...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
1Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'influenza': {'namespace': 'doid', 'name': 'i...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
2This may be accounted for by two complementary...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32129518.0NaN
3ACE2 can also antagonize cardiac fibrosis and ...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'Ventricular Remodeling': {'namespace': 'mesh...{'annotations': {}, 'citation': {'authors': ['...32221983.0NaN
4ACE2 can also antagonize cardiac fibrosis and ...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'Myocardial fibrosis': {'namespace': 'hp', 'n...{'annotations': {}, 'citation': {'authors': ['...32221983.0NaN
5The existence of significantly increased fibri...{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...False{'Hyperfibrinolysis': {'namespace': 'hp', 'nam...{'annotations': {}, 'citation': {'authors': ['...32216698.0NaN
6This opinion is supported by the presence of h...{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...False{'Hemorrhage': {'namespace': 'mesh', 'name': '...{'annotations': {}, 'citation': {'authors': ['...32216698.0NaN
7In the influenza virus model, it was reported ...{'chloroquine': {'namespace': 'chebi', 'name':...False{'dendritic cell antigen processing and presen...{'annotations': {'mesh': {'D007251': True}}, '...32171740.0NaN
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 Adverse reactions of IFN-α mainly include low-... \n", + "1 Adverse reactions of IFN-α mainly include low-... \n", + "2 This may be accounted for by two complementary... \n", + "3 ACE2 can also antagonize cardiac fibrosis and ... \n", + "4 ACE2 can also antagonize cardiac fibrosis and ... \n", + "5 The existence of significantly increased fibri... \n", + "6 This opinion is supported by the presence of h... \n", + "7 In the influenza virus model, it was reported ... \n", + "\n", + " source relation \\\n", + "0 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", + "1 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", + "2 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", + "3 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", + "4 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", + "5 {'Fibrin': {'namespace': 'chebi', 'name': 'Fib... False \n", + "6 {'Fibrin': {'namespace': 'chebi', 'name': 'Fib... False \n", + "7 {'chloroquine': {'namespace': 'chebi', 'name':... False \n", + "\n", + " target \\\n", + "0 {'Low-grade fever': {'namespace': 'hp', 'name'... \n", + "1 {'influenza': {'namespace': 'doid', 'name': 'i... \n", + "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "3 {'Ventricular Remodeling': {'namespace': 'mesh... \n", + "4 {'Myocardial fibrosis': {'namespace': 'hp', 'n... \n", + "5 {'Hyperfibrinolysis': {'namespace': 'hp', 'nam... \n", + "6 {'Hemorrhage': {'namespace': 'mesh', 'name': '... \n", + "7 {'dendritic cell antigen processing and presen... \n", + "\n", + " link pmc_id doi_id \n", + "0 {'annotations': {}, 'citation': {'authors': ['... 32166483.0 NaN \n", + "1 {'annotations': {}, 'citation': {'authors': ['... 32166483.0 NaN \n", + "2 {'annotations': {}, 'citation': {'authors': ['... 32129518.0 NaN \n", + "3 {'annotations': {}, 'citation': {'authors': ['... 32221983.0 NaN \n", + "4 {'annotations': {}, 'citation': {'authors': ['... 
32221983.0 NaN \n", + "5 {'annotations': {}, 'citation': {'authors': ['... 32216698.0 NaN \n", + "6 {'annotations': {}, 'citation': {'authors': ['... 32216698.0 NaN \n", + "7 {'annotations': {'mesh': {'D007251': True}}, '... 32171740.0 NaN " + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "positive_relation_df = example_data[example_data['relation']==0]\n", + "positive_relation_df.reset_index(inplace=True,drop=True)\n", + "positive_relation_df.head(8)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Adverse reactions of IFN-α mainly include low-grade fever and flu-like symptoms (both in children with intramuscularly injection) [11].'" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "positive_relation_df['text'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Source-Target dictionary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A simple but clean rule for identifying negative correlation sentences is to check whether negative tokens occur in the words between the source and the target. So, a source-target dictionary is created for some of the examples (in the final pipeline the source-target dictionary will be obtained from the spaCy pipeline)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('tetrandrine', 'TPC2'),\n", + " ('triiodothyronine', 'recovered'),\n", + " ('methylprednisolone', 'death'),\n", + " ('IFN-α', 'fever'),\n", + " ('angiotensin', 'vasodilator'),\n", + " ('ACE2', 'Ang'),\n", + " ('fibrin', 'COVID-19'),\n", + " ('hemorrhage', 'fibrinolysis'),\n", + " ('chloroquine', 'dendritic')]" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_sources = ['tetrandrine','triiodothyronine','methylprednisolone','IFN-α','angiotensin','ACE2','fibrin','hemorrhage','chloroquine']\n", + "\n", + "example_targets = ['TPC2','recovered','death','fever','vasodilator','Ang','COVID-19','fibrinolysis','dendritic'] #low-grade fever\n", + "\n", + "example_source_target_dict = list(OrderedDict.fromkeys(zip(example_sources,example_targets)))\n", + "example_source_target_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 Labeling functions for RE\n", + "\n", + "#### 2.4.1 Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [], + "source": [ + "spacy = SpacyPreprocessor(text_field=\"text\", doc_field=\"doc\", memoize=True)\n", + "\n", + "ABSTAIN = -1\n", + "NOT_FOUND = 0\n", + "FOUND = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source target pair: ('tetrandrine', 'TPC2')\n" + ] + } + ], + "source": [ + "@preprocessor(pre=[spacy])\n", + "def get_source_target(cand: DataPoint) -> DataPoint:\n", + " \"\"\"\n", + " Returns the source and target mentioned in the sentence\n", + " \"\"\"\n", + " person_names = []\n", + "\n", + " source = [token.text for token in cand.doc if token.text in example_sources]\n", + " target = [token.text for token in cand.doc if 
token.text in example_targets]\n", + " \n", + " try:\n", + " cand.source_target = (source[0],target[0])\n", + " except:\n", + " cand.source_target = (np.nan,np.nan)\n", + " return cand\n", + "\n", + "########### function example #####################\n", + "\n", + "candidate = example_data.loc[0]\n", + "candidate_with_function_applied = get_source_target(candidate) \n", + "\n", + "print(\"Source target pair: \", candidate_with_function_applied.source_target)" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: Thyroid stimulating hormone and free triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in recovered patients (1.4 mIU/mL and 4.3 pmol/L).\n", + "Text Between: triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in\n", + "Text Between: Thyroid stimulating hormone and free\n" + ] + } + ], + "source": [ + "@preprocessor(pre=[spacy])\n", + "def get_text_between(cand: DataPoint) -> DataPoint:\n", + " \"\"\"\n", + " Returns the text between a source target pair\n", + " \"\"\"\n", + " person_names = []\n", + "\n", + " source_idx = [token.i for token in cand.doc if token.text in example_sources]\n", + " target_idx = [token.i for token in cand.doc if token.text in example_targets]\n", + " \n", + " try:\n", + "\n", + " if (len(target_idx)==1) & (len(source_idx)==1) & (source_idx[0]1) & (len(source_idx)==1):\n", + " for target_index in target_idx:\n", + " if source_idx[0]1) & (len(target_idx)==1):\n", + " for source_index in source_idx:\n", + " if source_index1) & (len(target_idx)>1):\n", + " for source_index in source_idx:\n", + " for target_index in target_idx:\n", + " if source_index 0 else ABSTAIN\n", + "\n", + "#positive rule - version 2\n", + 
"@labeling_function(pre=[spacy,get_text_between],resources=dict(reduction_tokens=reduction_tokens))\n", + "def contains_reduction_tokens_text_between(x,reduction_tokens):\n", + " relation_text = x.text_between\n", + " relation_text_tokens = [str(token) for token in relation_text]\n", + " return FOUND if len(reduction_tokens.intersection(set(relation_text_tokens))) > 0 else ABSTAIN\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "negative_correlation_regex_1 = 'negative correlation'\n", + "negative_correlation_regex_2 = 'negatively correlated'\n", + "negative_correlation_regex_3 = 'negatively related'\n", + "negative_correlation_regex_4 = 'inversely related'\n", + "negative_correlation_regex_5 = 'inverse relation'\n", + "negative_correlation_regex_6 = 'negative effect'\n", + "negative_correlation_regex_7 = 'move in opposite directions'\n", + "\n", + "@labeling_function()\n", + "def contains_negative_corrrelation_regex(x):\n", + " if re.search(negative_correlation_regex_1, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_2, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_3, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_4, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_5, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_6, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_7, x.text, flags=re.I):\n", + " return FOUND\n", + "\n", + " else: \n", + " return ABSTAIN\n", + " " + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [], + "source": [ + "@labeling_function(pre=[spacy,get_text_between])\n", + "def contains_increase_decrease_pattern(x):\n", + " if ('increase' in [str(token) for token in x.text_to_source_left]) & ('decrease' in [str(token) for token in x.text_between]):\n", + " return FOUND\n", + " elif ('decrease' in [str(token) for token in x.text_to_source_left]) & ('increase' in [str(token) for token in x.text_between]):\n", + " return FOUND\n", + " else:\n", + " return ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### B. Negative rules: rules which tell you what a sentence without negative correlation looks like\n", + "\n", + "1) does the sentence contain increase-related words like 'increased', 'higher'" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "increase_tokens = {'increased',\n", + " 'higher',\n", + "}\n", + "\n", + "@labeling_function(pre=[spacy],resources=dict(increase_tokens=increase_tokens))\n", + "def contains_increase_tokens(x,increase_tokens):\n", + " tokens = [str(token) for token in x.doc]\n", + " return NOT_FOUND if len(increase_tokens.intersection(set(tokens))) > 0 else ABSTAIN\n", + "\n", + "\n", + "@labeling_function(pre=[spacy,get_text_between],resources=dict(increase_tokens=increase_tokens))\n", + "def contains_increase_tokens_text_between(x, increase_tokens):\n", + " relation_text = x.text_between\n", + " relation_text_tokens = [str(token) for token in relation_text]\n", + " return NOT_FOUND if len(increase_tokens.intersection(set(relation_text_tokens))) > 0 else ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) does the sentence contain the expression 
'positively correlated' or 'positive correlation' or 'positively related' or 'positive effect' or 'move in the same direction'" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [], + "source": [ + "#regex\n", + "positive_correlation_regex_1 = 'positive correlation'\n", + "positive_correlation_regex_2 = 'positively correlated'\n", + "positive_correlation_regex_3 = 'positively related'\n", + "positive_correlation_regex_4 = 'positive effect'\n", + "positive_correlation_regex_5 = 'move in the same direction'\n", + "\n", + "@labeling_function()\n", + "def contains_positive_corrrelation_regex(x):\n", + " if re.search(positive_correlation_regex_1, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(positive_correlation_regex_2, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(positive_correlation_regex_3, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(positive_correlation_regex_4, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(positive_correlation_regex_5, x.text, flags=re.I):\n", + " return FOUND \n", + " else:\n", + " return ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3) does the syntax follow the structure 'increase in x is associated with an increase in y' (and vice versa)" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [], + "source": [ + "@labeling_function(pre=[spacy,get_text_between])\n", + "def contains_increase_increase_pattern(x):\n", + " if ('increase' in [str(token) for token in x.text_to_source_left]) & ('increase' in [str(token) for token in x.text_between]):\n", + " return FOUND\n", + " elif ('decrease' in [str(token) for token in x.text_to_source_left]) & ('decrease' in [str(token) for token in x.text_between]):\n", + " return FOUND\n", + " else:\n", + " return ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.5 Creating all 
the labels for the different rules" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", + " from pandas import Panel\n", + "\n", + " 0%| | 0/2227 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jPolarityCoverageOverlapsConflicts
contains_reduction_tokens0[1]0.0570270.0193080.016165
contains_reduction_tokens_text_between1[1]0.0040410.0040410.000898
contains_negative_corrrelation_regex2[1]0.0085320.0000000.000000
contains_increase_decrease_pattern3[]0.0000000.0000000.000000
contains_increase_tokens4[0]0.1302200.0161650.016165
contains_increase_tokens_text_between5[]0.0000000.0000000.000000
contains_positive_corrrelation_regex6[1]0.0022450.0000000.000000
contains_increase_increase_pattern7[]0.0000000.0000000.000000
\n", + "" + ], + "text/plain": [ + " j Polarity Coverage Overlaps \\\n", + "contains_reduction_tokens 0 [1] 0.057027 0.019308 \n", + "contains_reduction_tokens_text_between 1 [1] 0.004041 0.004041 \n", + "contains_negative_corrrelation_regex 2 [1] 0.008532 0.000000 \n", + "contains_increase_decrease_pattern 3 [] 0.000000 0.000000 \n", + "contains_increase_tokens 4 [0] 0.130220 0.016165 \n", + "contains_increase_tokens_text_between 5 [] 0.000000 0.000000 \n", + "contains_positive_corrrelation_regex 6 [1] 0.002245 0.000000 \n", + "contains_increase_increase_pattern 7 [] 0.000000 0.000000 \n", + "\n", + " Conflicts \n", + "contains_reduction_tokens 0.016165 \n", + "contains_reduction_tokens_text_between 0.000898 \n", + "contains_negative_corrrelation_regex 0.000000 \n", + "contains_increase_decrease_pattern 0.000000 \n", + "contains_increase_tokens 0.016165 \n", + "contains_increase_tokens_text_between 0.000000 \n", + "contains_positive_corrrelation_regex 0.000000 \n", + "contains_increase_increase_pattern 0.000000 " + ] + }, + "execution_count": 148, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#in the absence of a benchmark to compare against\n", + "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jPolarityCoverageOverlapsConflictsCorrectIncorrectEmp. Acc.
contains_reduction_tokens0[1]0.0570270.0193080.01616569580.543307
contains_reduction_tokens_text_between1[1]0.0040410.0040410.000898810.888889
contains_negative_corrrelation_regex2[1]0.0085320.0000000.0000001810.947368
contains_increase_decrease_pattern3[]0.0000000.0000000.000000000.000000
contains_increase_tokens4[0]0.1302200.0161650.016165263270.906897
contains_increase_tokens_text_between5[]0.0000000.0000000.000000000.000000
contains_positive_corrrelation_regex6[1]0.0022450.0000000.000000050.000000
contains_increase_increase_pattern7[]0.0000000.0000000.000000000.000000
\n", + "
" + ], + "text/plain": [ + " j Polarity Coverage Overlaps \\\n", + "contains_reduction_tokens 0 [1] 0.057027 0.019308 \n", + "contains_reduction_tokens_text_between 1 [1] 0.004041 0.004041 \n", + "contains_negative_corrrelation_regex 2 [1] 0.008532 0.000000 \n", + "contains_increase_decrease_pattern 3 [] 0.000000 0.000000 \n", + "contains_increase_tokens 4 [0] 0.130220 0.016165 \n", + "contains_increase_tokens_text_between 5 [] 0.000000 0.000000 \n", + "contains_positive_corrrelation_regex 6 [1] 0.002245 0.000000 \n", + "contains_increase_increase_pattern 7 [] 0.000000 0.000000 \n", + "\n", + " Conflicts Correct Incorrect \\\n", + "contains_reduction_tokens 0.016165 69 58 \n", + "contains_reduction_tokens_text_between 0.000898 8 1 \n", + "contains_negative_corrrelation_regex 0.000000 18 1 \n", + "contains_increase_decrease_pattern 0.000000 0 0 \n", + "contains_increase_tokens 0.016165 263 27 \n", + "contains_increase_tokens_text_between 0.000000 0 0 \n", + "contains_positive_corrrelation_regex 0.000000 0 5 \n", + "contains_increase_increase_pattern 0.000000 0 0 \n", + "\n", + " Emp. Acc. \n", + "contains_reduction_tokens 0.543307 \n", + "contains_reduction_tokens_text_between 0.888889 \n", + "contains_negative_corrrelation_regex 0.947368 \n", + "contains_increase_decrease_pattern 0.000000 \n", + "contains_increase_tokens 0.906897 \n", + "contains_increase_tokens_text_between 0.000000 \n", + "contains_positive_corrrelation_regex 0.000000 \n", + "contains_increase_increase_pattern 0.000000 " + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#examining the quality of the labels in the presence of a benchmark to compare against\n", + "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary(y_train.values)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0While blocking TPC2 activity by tetrandrine, a...
2Thyroid stimulating hormone and free triiodoth...
3The administration of methylprednisolone appea...
7Consistent with previous reports, 20mM NH4Cl a...
12Consistent with previous reports, 20mM NH4Cl a...
......
1644Actual bicarbonate and total carbon dioxide co...
1655Albumin concentrations were significantly lowe...
1657Moreover, the frequencies of regulatory T cell...
1658The reduced expressions of interferon-γ (IFN-γ...
1668Spleen atrophy was observed in all reported ca...
\n", + "

127 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " text\n", + "0 While blocking TPC2 activity by tetrandrine, a...\n", + "2 Thyroid stimulating hormone and free triiodoth...\n", + "3 The administration of methylprednisolone appea...\n", + "7 Consistent with previous reports, 20mM NH4Cl a...\n", + "12 Consistent with previous reports, 20mM NH4Cl a...\n", + "... ...\n", + "1644 Actual bicarbonate and total carbon dioxide co...\n", + "1655 Albumin concentrations were significantly lowe...\n", + "1657 Moreover, the frequencies of regulatory T cell...\n", + "1658 The reduced expressions of interferon-γ (IFN-γ...\n", + "1668 Spleen atrophy was observed in all reported ca...\n", + "\n", + "[127 rows x 1 columns]" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#examine which sentences were picked up as showing negative correlation by each label function\n", + "df_train.iloc[label_matrix_train[:, 0] == FOUND]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.7 Predict the final label\n", + "\n", + "Different models can be used to create the final model that aggregates the different label functions to predict the final label." + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. 
Accessing it from the top-level namespace will also be removed in the next version\n", + " from pandas import Panel\n", + "\n", + " 0%| | 0/557 [00:00 0.5635383924562192 \n", + " test-> 0.4703770197486535 \n", + " AUC: \n", + " train-> 0.6010212548732079 \n", + " test-> 0.5227059436913452 \n", + "\n", + "Label Model \n", + " Accuracy: \n", + " train-> 0.5527615626403233 \n", + " test-> 0.49012567324955114 \n", + " AUC: \n", + " train-> 0.524345344386498 \n", + " test-> 0.4437434827945777 \n", + "\n", + "Random Voter Model \n", + " Accuracy: \n", + " train-> 0.5024696901661428 \n", + " test-> 0.5008976660682226 \n", + " AUC: \n", + " train-> 0.5126309212678816 \n", + " test-> 0.5023114355231144 \n", + "\n" + ] + } + ], + "source": [ + "all_models = {'Majority Model':majority_model,\n", + " 'Label Model':label_model,\n", + " 'Random Voter Model':random_voter}\n", + "\n", + "for model_name,model in all_models.items():\n", + " \n", + " #accuracy\n", + " train_acc = model.score(L=label_matrix_train, Y=y_train, tie_break_policy=\"random\")[\"accuracy\"]\n", + " test_acc = model.score(L=label_matrix_test, Y=y_test, tie_break_policy=\"random\")[\"accuracy\"]\n", + " \n", + " #auc\n", + " train_auc = metric_score(y_train, probs=model.predict_proba(L=label_matrix_train), metric='roc_auc')\n", + " test_auc = metric_score(y_test, probs=model.predict_proba(L=label_matrix_test), metric='roc_auc')\n", + " \n", + " print(f'{model_name}','\\n',\n", + " 'Accuracy:','\\n','train->',train_acc,'\\n','test->',test_acc,'\\n',\n", + " 'AUC:','\\n','train->',train_auc,'\\n','test->',test_auc,'\\n')\n", + " \n", + " \n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By test AUC the majority model scores highest (0.52); the label model (0.44) scores below the random voter (0.50), so it is not the best model on this metric." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Filter out unlabeled points" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total points labelled in training data: 405\n", + "Total points labelled in testing data: 91\n" + ] + } + ], + "source": [ + "#training labels\n", + "df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(\n", + " X=df_train['text'], \n", + " y=lm_preds_proba_train, \n", + " L=label_matrix_train\n", + ")\n", + "\n", + "#testing labels\n", + "df_test_filtered, probs_test_filtered = filter_unlabeled_dataframe(\n", + " X=df_test['text'], \n", + " y=lm_preds_proba_test, \n", + " L=label_matrix_test\n", + ")\n", + "\n", + "print('Total points labelled in training data:',len(df_train_filtered))\n", + "print('Total points labelled in testing data:',len(df_test_filtered))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# References\n", + "\n", + "https://www.snorkel.org/use-cases/spouse-demo\n", + " \n", + "https://github.com/snorkel-team/snorkel-tutorials/blob/master/spouse/spouse_demo.ipynb\n", + " \n", + "https://www.snorkel.org/use-cases/01-spam-tutorial\n", + " \n", + "https://readthedocs.org/projects/snorkel/downloads/pdf/master/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 68a1ce1d76c6cd16d88c64166d482742f7d81377 Mon Sep 17 00:00:00 2001 From: Kriti Mahajan Date: Wed, 1 Jul 2020 20:04:17 +0530 Subject: [PATCH 2/3] Delete Snorkel RE example.ipynb --- .../notebooks/Snorkel RE example.ipynb | 2187 ----------------- 1 file changed, 2187 deletions(-) delete mode 
100644 immunology_kg/notebooks/Snorkel RE example.ipynb diff --git a/immunology_kg/notebooks/Snorkel RE example.ipynb b/immunology_kg/notebooks/Snorkel RE example.ipynb deleted file mode 100644 index d44c4a9..0000000 --- a/immunology_kg/notebooks/Snorkel RE example.ipynb +++ /dev/null @@ -1,2187 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import snorkel\n", - "\n", - "from snorkel.preprocess import preprocessor\n", - "from snorkel.preprocess.nlp import SpacyPreprocessor\n", - "from snorkel.types import DataPoint\n", - "\n", - "from snorkel.labeling.lf.nlp import nlp_labeling_function\n", - "from snorkel.labeling import PandasLFApplier,filter_unlabeled_dataframe,LFAnalysis ,labeling_function\n", - "from snorkel.labeling.model import MajorityClassVoter,MajorityLabelVoter,RandomVoter ,LabelModel\n", - "\n", - "from snorkel.analysis import metric_score , get_label_buckets\n", - "\n", - "from snorkel.utils import probs_to_preds\n", - "\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "import pandas as pd\n", - "import re\n", - "import os\n", - "from collections import OrderedDict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Load the data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0textsourcerelationtargetlinkpmc_iddoi_id
00While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...negativeCorrelation{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
11Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...negativeCorrelation{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
22Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...negativeCorrelation{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
33Based on these results, we performed virtual d...{\"4'-epidoxorubicin\": {'namespace': 'chebi', '...decreases{'3.4.22.69': {'namespace': 'eccode', 'name': ...{'annotations': {}, 'citation': {'authors': ['...32173287.0NaN
44Doctors can also use a clinically approved bil...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...{'annotations': {'mesh': {'D008168': True}}, '...32205856.0NaN
55Since Vitamin B3 is highly lung protective, it...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...{'annotations': {}, 'citation': {'authors': ['...32205856.0NaN
66Doctors can also use a clinically approved bil...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'inflammatory response': {'namespace': 'go', ...{'annotations': {'mesh': {'D008168': True}}, '...32205856.0NaN
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 text \\\n", - "0 0 While blocking TPC2 activity by tetrandrine, a... \n", - "1 1 Chemoinformatics searches yielded 15 approved ... \n", - "2 2 Thyroid stimulating hormone and free triiodoth... \n", - "3 3 Based on these results, we performed virtual d... \n", - "4 4 Doctors can also use a clinically approved bil... \n", - "5 5 Since Vitamin B3 is highly lung protective, it... \n", - "6 6 Doctors can also use a clinically approved bil... \n", - "\n", - " source relation \\\n", - "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... negativeCorrelation \n", - "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... negativeCorrelation \n", - "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... negativeCorrelation \n", - "3 {\"4'-epidoxorubicin\": {'namespace': 'chebi', '... decreases \n", - "4 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", - "5 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", - "6 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", - "\n", - " target \\\n", - "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", - "1 {'hypertension': {'namespace': 'doid', 'name':... \n", - "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", - "3 {'3.4.22.69': {'namespace': 'eccode', 'name': ... \n", - "4 {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',... \n", - "5 {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',... \n", - "6 {'inflammatory response': {'namespace': 'go', ... \n", - "\n", - " link pmc_id \\\n", - "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", - "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", - "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", - "3 {'annotations': {}, 'citation': {'authors': ['... 32173287.0 \n", - "4 {'annotations': {'mesh': {'D008168': True}}, '... 32205856.0 \n", - "5 {'annotations': {}, 'citation': {'authors': ['... 32205856.0 \n", - "6 {'annotations': {'mesh': {'D008168': True}}, '... 
32205856.0 \n", - "\n", - " doi_id \n", - "0 NaN \n", - "1 https://doi.org/10.1101/2020.03.22.002386 \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - "5 NaN \n", - "6 NaN " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#'https://raw.githubusercontent.com/covid19kg/covid19kg/master/supplement/terminology.csv'\n", - "url = 'https://raw.githubusercontent.com/CoronaWhy/task-vt/kaleidoescape_kg/immunology_kg/relations/covid19_frauenhofer_annotations.csv'\n", - "pybel_pd = pd.read_csv(url)\n", - "pybel_pd.head(7)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['negativeCorrelation', 'decreases', 'regulates', 'increases',\n", - " 'positiveCorrelation', 'association', 'isA', 'biomarkerFor',\n", - " 'prognosticBiomarkerFor', 'causesNoChange'], dtype=object)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#list all types of relations\n", - "relation_categories = pybel_pd['relation'].unique()\n", - "relation_categories" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Snorkel Example\n", - "\n", - "For the purpose of this example we'll only focus on rows with 'negativeCorrelation' and 'positiveCorrelation' as their relations." 
- ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " \n", - "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:4102: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " errors=errors,\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
textsourcerelationtargetlinkpmc_iddoi_id
0While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...True{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
1Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...True{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
2Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
3The administration of methylprednisolone appea...{'6-methylprednisolone': {'namespace': 'chebi'...True{'Death': {'namespace': 'mesh', 'name': 'Death...{'annotations': {'doid': {'11394': True}}, 'ci...32167524.0NaN
4Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'Low-grade fever': {'namespace': 'hp', 'name'...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
\n", - "
" - ], - "text/plain": [ - " text \\\n", - "0 While blocking TPC2 activity by tetrandrine, a... \n", - "1 Chemoinformatics searches yielded 15 approved ... \n", - "2 Thyroid stimulating hormone and free triiodoth... \n", - "3 The administration of methylprednisolone appea... \n", - "4 Adverse reactions of IFN-α mainly include low-... \n", - "\n", - " source relation \\\n", - "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... True \n", - "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... True \n", - "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... True \n", - "3 {'6-methylprednisolone': {'namespace': 'chebi'... True \n", - "4 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", - "\n", - " target \\\n", - "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", - "1 {'hypertension': {'namespace': 'doid', 'name':... \n", - "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", - "3 {'Death': {'namespace': 'mesh', 'name': 'Death... \n", - "4 {'Low-grade fever': {'namespace': 'hp', 'name'... \n", - "\n", - " link pmc_id \\\n", - "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", - "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", - "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", - "3 {'annotations': {'doid': {'11394': True}}, 'ci... 32167524.0 \n", - "4 {'annotations': {}, 'citation': {'authors': ['... 
32166483.0 \n", - "\n", - " doi_id \n", - "0 NaN \n", - "1 https://doi.org/10.1101/2020.03.22.002386 \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " - ] - }, - "execution_count": 131, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "example_data = pybel_pd[(pybel_pd['relation']=='negativeCorrelation') | (pybel_pd['relation']=='positiveCorrelation') ]\n", - "example_data['relation'] = example_data['relation']=='negativeCorrelation'\n", - "example_data.reset_index(inplace=True,drop=True)\n", - "example_data.drop('Unnamed: 0',inplace=True,axis=1)\n", - "example_data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1 Split the data into training and testing \n", - "\n", - "Ideally should have training , validation and testing set. Also, here , I'm using a fixed testing period but k-fold cross validation techniques are a more robust way of determining the accuracy of the generated labels." - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "metadata": {}, - "outputs": [], - "source": [ - "df_train,df_test,y_train,y_test = train_test_split(example_data[['text']],example_data[['relation']],test_size=0.20,shuffle=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.2 Reading sentences to understand syntactic differences betweem negative and positive correlation sentences\n", - "\n", - "The utility of snorkel is that it allows you to create multiple labelling functions which (try to) mimic the rules that a human annotator of data would apply while deciding how to label unlabelled data. For instance a human annotator looking to identify negative correlation is sentences will follow the following rules:\n", - "\n", - "##### A. 
Positive rules: rules which tell you what a sentences with negative correlation looks like\n", - "\n", - "1) does the sentence contain the reduction related words like words 'decreased','reduced','lowered'\n", - "\n", - "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'\n", - "\n", - "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)\n", - "\n", - "4) does the sentence contain the expression 'negative effect'\n", - "\n", - "5) does the sentence contain the expression 'move in opposite directions'\n", - "\n", - "##### B. Negative rules: rules which tell you what a sentences without negative correlation looks like\n", - "\n", - "1) does the sentence contain the increase related words like words 'increased','improved'\n", - "\n", - "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related'\n", - "\n", - "3) does the syntax follow the structure 'increase in x is associated with a increase in y' (and vice versa)\n", - "\n", - "4) does the sentence contain the expression 'positive effect'\n", - "\n", - "5) does the sentence contain the expression 'move in the same direction'\n", - "\n", - "These rules can be coded using snorkel. Importantly it requires both positive and negative rules. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.2.1 Examining Negative correlation sentences to understand their syntactic structure and then define labelling functions accordingly" - ] - }, - { - "cell_type": "code", - "execution_count": 133, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
textsourcerelationtargetlinkpmc_iddoi_id
0While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...True{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
1Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...True{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
2Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
3The administration of methylprednisolone appea...{'6-methylprednisolone': {'namespace': 'chebi'...True{'Death': {'namespace': 'mesh', 'name': 'Death...{'annotations': {'doid': {'11394': True}}, 'ci...32167524.0NaN
4In our opinion, during the COVID-19 pandemic, ...{'adrenergic antagonist': {'namespace': 'chebi...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32220710.0NaN
5Consistent with previous reports, 20mM NH4Cl a...{'ammonium chloride': {'namespace': 'chebi', '...True{'G protein, vesicular stomatitis virus': {'na...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
6If the latter percentage would be found to be ...{'angiotensin receptor antagonist': {'namespac...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32129518.0NaN
7Consistent with previous reports, 20mM NH4Cl a...{'bafilomycin A1': {'namespace': 'chebi', 'nam...True{'G protein, vesicular stomatitis virus': {'na...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
\n", - "
" - ], - "text/plain": [ - " text \\\n", - "0 While blocking TPC2 activity by tetrandrine, a... \n", - "1 Chemoinformatics searches yielded 15 approved ... \n", - "2 Thyroid stimulating hormone and free triiodoth... \n", - "3 The administration of methylprednisolone appea... \n", - "4 In our opinion, during the COVID-19 pandemic, ... \n", - "5 Consistent with previous reports, 20mM NH4Cl a... \n", - "6 If the latter percentage would be found to be ... \n", - "7 Consistent with previous reports, 20mM NH4Cl a... \n", - "\n", - " source relation \\\n", - "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... True \n", - "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... True \n", - "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... True \n", - "3 {'6-methylprednisolone': {'namespace': 'chebi'... True \n", - "4 {'adrenergic antagonist': {'namespace': 'chebi... True \n", - "5 {'ammonium chloride': {'namespace': 'chebi', '... True \n", - "6 {'angiotensin receptor antagonist': {'namespac... True \n", - "7 {'bafilomycin A1': {'namespace': 'chebi', 'nam... True \n", - "\n", - " target \\\n", - "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", - "1 {'hypertension': {'namespace': 'doid', 'name':... \n", - "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", - "3 {'Death': {'namespace': 'mesh', 'name': 'Death... \n", - "4 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", - "5 {'G protein, vesicular stomatitis virus': {'na... \n", - "6 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", - "7 {'G protein, vesicular stomatitis virus': {'na... \n", - "\n", - " link pmc_id \\\n", - "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", - "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", - "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", - "3 {'annotations': {'doid': {'11394': True}}, 'ci... 32167524.0 \n", - "4 {'annotations': {}, 'citation': {'authors': ['... 
32220710.0 \n", - "5 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", - "6 {'annotations': {}, 'citation': {'authors': ['... 32129518.0 \n", - "7 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", - "\n", - " doi_id \n", - "0 NaN \n", - "1 https://doi.org/10.1101/2020.03.22.002386 \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - "5 NaN \n", - "6 NaN \n", - "7 NaN " - ] - }, - "execution_count": 133, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "neg_correl_df = example_data[example_data['relation']==1]\n", - "neg_correl_df.reset_index(inplace=True,drop=True)\n", - "neg_correl_df.head(8)" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry.'" - ] - }, - "execution_count": 134, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "neg_correl_df['text'][0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.2.2 Examining positive correlation sentences to understand their syntactic structure and then define labelling funtions accordingly" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
textsourcerelationtargetlinkpmc_iddoi_id
0Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'Low-grade fever': {'namespace': 'hp', 'name'...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
1Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'influenza': {'namespace': 'doid', 'name': 'i...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
2This may be accounted for by two complementary...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32129518.0NaN
3ACE2 can also antagonize cardiac fibrosis and ...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'Ventricular Remodeling': {'namespace': 'mesh...{'annotations': {}, 'citation': {'authors': ['...32221983.0NaN
4ACE2 can also antagonize cardiac fibrosis and ...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'Myocardial fibrosis': {'namespace': 'hp', 'n...{'annotations': {}, 'citation': {'authors': ['...32221983.0NaN
5The existence of significantly increased fibri...{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...False{'Hyperfibrinolysis': {'namespace': 'hp', 'nam...{'annotations': {}, 'citation': {'authors': ['...32216698.0NaN
6This opinion is supported by the presence of h...{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...False{'Hemorrhage': {'namespace': 'mesh', 'name': '...{'annotations': {}, 'citation': {'authors': ['...32216698.0NaN
7In the influenza virus model, it was reported ...{'chloroquine': {'namespace': 'chebi', 'name':...False{'dendritic cell antigen processing and presen...{'annotations': {'mesh': {'D007251': True}}, '...32171740.0NaN
\n", - "
" - ], - "text/plain": [ - " text \\\n", - "0 Adverse reactions of IFN-α mainly include low-... \n", - "1 Adverse reactions of IFN-α mainly include low-... \n", - "2 This may be accounted for by two complementary... \n", - "3 ACE2 can also antagonize cardiac fibrosis and ... \n", - "4 ACE2 can also antagonize cardiac fibrosis and ... \n", - "5 The existence of significantly increased fibri... \n", - "6 This opinion is supported by the presence of h... \n", - "7 In the influenza virus model, it was reported ... \n", - "\n", - " source relation \\\n", - "0 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", - "1 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", - "2 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", - "3 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", - "4 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", - "5 {'Fibrin': {'namespace': 'chebi', 'name': 'Fib... False \n", - "6 {'Fibrin': {'namespace': 'chebi', 'name': 'Fib... False \n", - "7 {'chloroquine': {'namespace': 'chebi', 'name':... False \n", - "\n", - " target \\\n", - "0 {'Low-grade fever': {'namespace': 'hp', 'name'... \n", - "1 {'influenza': {'namespace': 'doid', 'name': 'i... \n", - "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", - "3 {'Ventricular Remodeling': {'namespace': 'mesh... \n", - "4 {'Myocardial fibrosis': {'namespace': 'hp', 'n... \n", - "5 {'Hyperfibrinolysis': {'namespace': 'hp', 'nam... \n", - "6 {'Hemorrhage': {'namespace': 'mesh', 'name': '... \n", - "7 {'dendritic cell antigen processing and presen... \n", - "\n", - " link pmc_id doi_id \n", - "0 {'annotations': {}, 'citation': {'authors': ['... 32166483.0 NaN \n", - "1 {'annotations': {}, 'citation': {'authors': ['... 32166483.0 NaN \n", - "2 {'annotations': {}, 'citation': {'authors': ['... 32129518.0 NaN \n", - "3 {'annotations': {}, 'citation': {'authors': ['... 32221983.0 NaN \n", - "4 {'annotations': {}, 'citation': {'authors': ['... 
32221983.0 NaN \n", - "5 {'annotations': {}, 'citation': {'authors': ['... 32216698.0 NaN \n", - "6 {'annotations': {}, 'citation': {'authors': ['... 32216698.0 NaN \n", - "7 {'annotations': {'mesh': {'D007251': True}}, '... 32171740.0 NaN " - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "positive_relation_df = example_data[example_data['relation']==0]\n", - "positive_relation_df.reset_index(inplace=True,drop=True)\n", - "positive_relation_df.head(8)" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Adverse reactions of IFN-α mainly include low-grade fever and flu-like symptoms (both in children with intramuscularly injection) [11].'" - ] - }, - "execution_count": 136, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "positive_relation_df['text'][0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.3 Source-Target dictionary" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A simple but clean rule for identifying negative correlation sentences would be if negative tokens occured in the words between the source and the target. So, a source-target dictonary is created for some of the examples (in the final pipeline the source target dictonary will be obtained from the spacy pipeline)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('tetrandrine', 'TPC2'),\n", - " ('triiodothyronine', 'recovered'),\n", - " ('methylprednisolone', 'death'),\n", - " ('IFN-α', 'fever'),\n", - " ('angiotensin', 'vasodilator'),\n", - " ('ACE2', 'Ang'),\n", - " ('fibrin', 'COVID-19'),\n", - " ('hemorrhage', 'fibrinolysis'),\n", - " ('chloroquine', 'dendritic')]" - ] - }, - "execution_count": 137, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "example_sources = ['tetrandrine','triiodothyronine','methylprednisolone','IFN-α','angiotensin','ACE2','fibrin','hemorrhage','chloroquine']\n", - "\n", - "example_targets = ['TPC2','recovered','death','fever','vasodilator','Ang','COVID-19','fibrinolysis','dendritic'] #low-grade fever\n", - "\n", - "example_source_target_dict = list(OrderedDict.fromkeys(zip(example_sources,example_targets)))\n", - "example_source_target_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.4 Labeling functions for RE\n", - "\n", - "#### 2.4.1 Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "metadata": {}, - "outputs": [], - "source": [ - "spacy = SpacyPreprocessor(text_field=\"text\", doc_field=\"doc\", memoize=True)\n", - "\n", - "ABSTAIN = -1\n", - "NOT_FOUND = 0\n", - "FOUND = 1" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Source target pair: ('tetrandrine', 'TPC2')\n" - ] - } - ], - "source": [ - "@preprocessor(pre=[spacy])\n", - "def get_source_target(cand: DataPoint) -> DataPoint:\n", - " \"\"\"\n", - " Returns the source and target mentioned in the sentence\n", - " \"\"\"\n", - " person_names = []\n", - "\n", - " source = [token.text for token in cand.doc if token.text in example_sources]\n", - " target = [token.text for token in cand.doc if 
token.text in example_targets]\n", - " \n", - " try:\n", - " cand.source_target = (source[0],target[0])\n", - " except:\n", - " cand.source_target = (np.nan,np.nan)\n", - " return cand\n", - "\n", - "########### function example #####################\n", - "\n", - "candidate = example_data.loc[0]\n", - "candidate_with_function_applied = get_source_target(candidate) \n", - "\n", - "print(\"Source target pair: \", candidate_with_function_applied.source_target)" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sentence: Thyroid stimulating hormone and free triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in recovered patients (1.4 mIU/mL and 4.3 pmol/L).\n", - "Text Between: triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in\n", - "Text Between: Thyroid stimulating hormone and free\n" - ] - } - ], - "source": [ - "@preprocessor(pre=[spacy])\n", - "def get_text_between(cand: DataPoint) -> DataPoint:\n", - " \"\"\"\n", - " Returns the text between a source target pair\n", - " \"\"\"\n", - " person_names = []\n", - "\n", - " source_idx = [token.i for token in cand.doc if token.text in example_sources]\n", - " target_idx = [token.i for token in cand.doc if token.text in example_targets]\n", - " \n", - " try:\n", - "\n", - " if (len(target_idx)==1) & (len(source_idx)==1) & (source_idx[0]1) & (len(source_idx)==1):\n", - " for target_index in target_idx:\n", - " if source_idx[0]1) & (len(target_idx)==1):\n", - " for source_index in source_idx:\n", - " if source_index1) & (len(target_idx)>1):\n", - " for source_index in source_idx:\n", - " for target_index in target_idx:\n", - " if source_index 0 else ABSTAIN\n", - "\n", - "#positive rule - version 2\n", - 
"@labeling_function(pre=[spacy,get_text_between],resources=dict(reduction_tokens=reduction_tokens))\n", - "def contains_reduction_tokens_text_between(x,reduction_tokens):\n", - " relation_text = x.text_between\n", - " relation_text_tokens = [str(token) for token in relation_text]\n", - " return FOUND if len(reduction_tokens.intersection(set(relation_text_tokens))) > 0 else ABSTAIN\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [], - "source": [ - "negative_correlation_regex_1 = 'negative correlation'\n", - "negative_correlation_regex_2 = 'negatively correlated'\n", - "negative_correlation_regex_3 = 'negatively related'\n", - "negative_correlation_regex_4 = 'inversely related'\n", - "negative_correlation_regex_5 = 'inverse relation'\n", - "negative_correlation_regex_6 = 'negative effect'\n", - "negative_correlation_regex_7 = 'move in opposite directions'\n", - "\n", - "@labeling_function()\n", - "def contains_negative_corrrelation_regex(x):\n", - " if re.search(negative_correlation_regex_1, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(negative_correlation_regex_2, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(negative_correlation_regex_3, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(negative_correlation_regex_4, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(negative_correlation_regex_5, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(negative_correlation_regex_6, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(negative_correlation_regex_7, x.text, flags=re.I):\n", - " return FOUND\n", - "\n", - " else: \n", - " return ABSTAIN\n", - " " - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [], - "source": [ - "@labeling_function(pre=[spacy,get_text_between])\n", - "def contains_increase_decrease_pattern(x):\n", - " if ('increase' in [str(token) for token in x.text_to_source_left]) & ('decrease' in [str(token) for token in x.text_between]):\n", - " return FOUND\n", - " elif ('decrease' in [str(token) for token in x.text_to_source_left]) & ('increase' in [str(token) for token in x.text_between]):\n", - " return FOUND\n", - " else:\n", - " return ABSTAIN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### B. Negative rules: rules which tell you what a sentences without negative correlation looks like\n", - "\n", - "1) does the sentence contain the increase related words like words 'increased','higher'" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [], - "source": [ - "increase_tokens = {'increased',\n", - " 'higher',\n", - "}\n", - "\n", - "@labeling_function(pre=[spacy],resources=dict(increase_tokens=increase_tokens))\n", - "def contains_increase_tokens(x,increase_tokens):\n", - " tokens = [str(token) for token in x.doc]\n", - " return NOT_FOUND if len(increase_tokens.intersection(set(tokens))) > 0 else ABSTAIN\n", - "\n", - "\n", - "@labeling_function(pre=[spacy,get_text_between],resources=dict(increase_tokens=increase_tokens))\n", - "def contains_increase_tokens_text_between(x, increase_tokens):\n", - " relation_text = x.text_between\n", - " relation_text_tokens = [str(token) for token in relation_text]\n", - " return NOT_FOUND if len(increase_tokens.intersection(set(relation_text_tokens))) > 0 else ABSTAIN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2) does the sentence contain the expression 
'positively correlated' or 'positive correlation' or 'positively related' or 'positive effect' or 'move in the same direction'" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [], - "source": [ - "#regex\n", - "positive_correlation_regex_1 = 'positive correlation'\n", - "positive_correlation_regex_2 = 'positively correlated'\n", - "positive_correlation_regex_3 = 'positively related'\n", - "positive_correlation_regex_4 = 'positive effect'\n", - "positive_correlation_regex_5 = 'move in the same direction'\n", - "\n", - "@labeling_function()\n", - "def contains_positive_corrrelation_regex(x):\n", - " if re.search(positive_correlation_regex_1, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(positive_correlation_regex_2, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(positive_correlation_regex_3, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(positive_correlation_regex_4, x.text, flags=re.I):\n", - " return FOUND\n", - " elif re.search(positive_correlation_regex_5, x.text, flags=re.I):\n", - " return FOUND \n", - " else:\n", - " return ABSTAIN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "3) does the syntax follow the structure 'increase in x is associated with a increase in y' (and vice versa)" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": {}, - "outputs": [], - "source": [ - "@labeling_function(pre=[spacy,get_text_between])\n", - "def contains_increase_increase_pattern(x):\n", - " if ('increase' in [str(token) for token in x.text_to_source_left]) & ('increase' in [str(token) for token in x.text_between]):\n", - " return FOUND\n", - " elif ('decrease' in [str(token) for token in x.text_to_source_left]) & ('decrease' in [str(token) for token in x.text_between]):\n", - " return FOUND\n", - " else:\n", - " return ABSTAIN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.5 Creating all 
the labels for the different rules" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", - " from pandas import Panel\n", - "\n", - " 0%| | 0/2227 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
jPolarityCoverageOverlapsConflicts
contains_reduction_tokens0[1]0.0570270.0193080.016165
contains_reduction_tokens_text_between1[1]0.0040410.0040410.000898
contains_negative_corrrelation_regex2[1]0.0085320.0000000.000000
contains_increase_decrease_pattern3[]0.0000000.0000000.000000
contains_increase_tokens4[0]0.1302200.0161650.016165
contains_increase_tokens_text_between5[]0.0000000.0000000.000000
contains_positive_corrrelation_regex6[1]0.0022450.0000000.000000
contains_increase_increase_pattern7[]0.0000000.0000000.000000
\n", - "" - ], - "text/plain": [ - " j Polarity Coverage Overlaps \\\n", - "contains_reduction_tokens 0 [1] 0.057027 0.019308 \n", - "contains_reduction_tokens_text_between 1 [1] 0.004041 0.004041 \n", - "contains_negative_corrrelation_regex 2 [1] 0.008532 0.000000 \n", - "contains_increase_decrease_pattern 3 [] 0.000000 0.000000 \n", - "contains_increase_tokens 4 [0] 0.130220 0.016165 \n", - "contains_increase_tokens_text_between 5 [] 0.000000 0.000000 \n", - "contains_positive_corrrelation_regex 6 [1] 0.002245 0.000000 \n", - "contains_increase_increase_pattern 7 [] 0.000000 0.000000 \n", - "\n", - " Conflicts \n", - "contains_reduction_tokens 0.016165 \n", - "contains_reduction_tokens_text_between 0.000898 \n", - "contains_negative_corrrelation_regex 0.000000 \n", - "contains_increase_decrease_pattern 0.000000 \n", - "contains_increase_tokens 0.016165 \n", - "contains_increase_tokens_text_between 0.000000 \n", - "contains_positive_corrrelation_regex 0.000000 \n", - "contains_increase_increase_pattern 0.000000 " - ] - }, - "execution_count": 148, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#in the absence of a benchmark to compare against\n", - "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
jPolarityCoverageOverlapsConflictsCorrectIncorrectEmp. Acc.
contains_reduction_tokens0[1]0.0570270.0193080.01616569580.543307
contains_reduction_tokens_text_between1[1]0.0040410.0040410.000898810.888889
contains_negative_corrrelation_regex2[1]0.0085320.0000000.0000001810.947368
contains_increase_decrease_pattern3[]0.0000000.0000000.000000000.000000
contains_increase_tokens4[0]0.1302200.0161650.016165263270.906897
contains_increase_tokens_text_between5[]0.0000000.0000000.000000000.000000
contains_positive_corrrelation_regex6[1]0.0022450.0000000.000000050.000000
contains_increase_increase_pattern7[]0.0000000.0000000.000000000.000000
\n", - "
" - ], - "text/plain": [ - " j Polarity Coverage Overlaps \\\n", - "contains_reduction_tokens 0 [1] 0.057027 0.019308 \n", - "contains_reduction_tokens_text_between 1 [1] 0.004041 0.004041 \n", - "contains_negative_corrrelation_regex 2 [1] 0.008532 0.000000 \n", - "contains_increase_decrease_pattern 3 [] 0.000000 0.000000 \n", - "contains_increase_tokens 4 [0] 0.130220 0.016165 \n", - "contains_increase_tokens_text_between 5 [] 0.000000 0.000000 \n", - "contains_positive_corrrelation_regex 6 [1] 0.002245 0.000000 \n", - "contains_increase_increase_pattern 7 [] 0.000000 0.000000 \n", - "\n", - " Conflicts Correct Incorrect \\\n", - "contains_reduction_tokens 0.016165 69 58 \n", - "contains_reduction_tokens_text_between 0.000898 8 1 \n", - "contains_negative_corrrelation_regex 0.000000 18 1 \n", - "contains_increase_decrease_pattern 0.000000 0 0 \n", - "contains_increase_tokens 0.016165 263 27 \n", - "contains_increase_tokens_text_between 0.000000 0 0 \n", - "contains_positive_corrrelation_regex 0.000000 0 5 \n", - "contains_increase_increase_pattern 0.000000 0 0 \n", - "\n", - " Emp. Acc. \n", - "contains_reduction_tokens 0.543307 \n", - "contains_reduction_tokens_text_between 0.888889 \n", - "contains_negative_corrrelation_regex 0.947368 \n", - "contains_increase_decrease_pattern 0.000000 \n", - "contains_increase_tokens 0.906897 \n", - "contains_increase_tokens_text_between 0.000000 \n", - "contains_positive_corrrelation_regex 0.000000 \n", - "contains_increase_increase_pattern 0.000000 " - ] - }, - "execution_count": 149, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#examining the quality of the labels in the presence of a benchmark to compare against\n", - "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary(y_train.values)" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
text
0While blocking TPC2 activity by tetrandrine, a...
2Thyroid stimulating hormone and free triiodoth...
3The administration of methylprednisolone appea...
7Consistent with previous reports, 20mM NH4Cl a...
12Consistent with previous reports, 20mM NH4Cl a...
......
1644Actual bicarbonate and total carbon dioxide co...
1655Albumin concentrations were significantly lowe...
1657Moreover, the frequencies of regulatory T cell...
1658The reduced expressions of interferon-γ (IFN-γ...
1668Spleen atrophy was observed in all reported ca...
\n", - "

127 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " text\n", - "0 While blocking TPC2 activity by tetrandrine, a...\n", - "2 Thyroid stimulating hormone and free triiodoth...\n", - "3 The administration of methylprednisolone appea...\n", - "7 Consistent with previous reports, 20mM NH4Cl a...\n", - "12 Consistent with previous reports, 20mM NH4Cl a...\n", - "... ...\n", - "1644 Actual bicarbonate and total carbon dioxide co...\n", - "1655 Albumin concentrations were significantly lowe...\n", - "1657 Moreover, the frequencies of regulatory T cell...\n", - "1658 The reduced expressions of interferon-γ (IFN-γ...\n", - "1668 Spleen atrophy was observed in all reported ca...\n", - "\n", - "[127 rows x 1 columns]" - ] - }, - "execution_count": 150, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#examine which sentences were picked up as showing negative correlation by each label function\n", - "df_train.iloc[label_matrix_train[:, 0] == FOUND]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.7 Predict the final label\n", - "\n", - "Different models can be used to create the final model that aggrateges the different label functions to perdict the final lebel." - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. 
Accessing it from the top-level namespace will also be removed in the next version\n", - " from pandas import Panel\n", - "\n", - " 0%| | 0/557 [00:00 0.5635383924562192 \n", - " test-> 0.4703770197486535 \n", - " AUC: \n", - " train-> 0.6010212548732079 \n", - " test-> 0.5227059436913452 \n", - "\n", - "Label Model \n", - " Accuracy: \n", - " train-> 0.5527615626403233 \n", - " test-> 0.49012567324955114 \n", - " AUC: \n", - " train-> 0.524345344386498 \n", - " test-> 0.4437434827945777 \n", - "\n", - "Random Voter Model \n", - " Accuracy: \n", - " train-> 0.5024696901661428 \n", - " test-> 0.5008976660682226 \n", - " AUC: \n", - " train-> 0.5126309212678816 \n", - " test-> 0.5023114355231144 \n", - "\n" - ] - } - ], - "source": [ - "all_models = {'Majority Model':majority_model,\n", - " 'Label Model':label_model,\n", - " 'Random Voter Model':random_voter}\n", - "\n", - "for model_name,model in all_models.items():\n", - " \n", - " #accuracy\n", - " train_acc = model.score(L=label_matrix_train, Y=y_train, tie_break_policy=\"random\")[\"accuracy\"]\n", - " test_acc = model.score(L=label_matrix_test, Y=y_test, tie_break_policy=\"random\")[\"accuracy\"]\n", - " \n", - " #auc\n", - " train_auc = metric_score(y_train, probs=model.predict_proba(L=label_matrix_train), metric='roc_auc')\n", - " test_auc = metric_score(y_test, probs=model.predict_proba(L=label_matrix_test), metric='roc_auc')\n", - " \n", - " print(f'{model_name}','\\n',\n", - " 'Accuracy:','\\n','train->',train_acc,'\\n','test->',test_acc,'\\n',\n", - " 'AUC:','\\n','train->',train_auc,'\\n','test->',test_auc,'\\n')\n", - " \n", - " \n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The label model has the highest test AUC so that's the best model." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. 
Filter out unlabeled points" - ] - }, - { - "cell_type": "code", - "execution_count": 169, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total points labelled in training data: 405\n", - "Total points labelled in testing data: 91\n" - ] - } - ], - "source": [ - "#training labels\n", - "df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(\n", - " X=df_train['text'], \n", - " y=lm_preds_proba_train, \n", - " L=label_matrix_train\n", - ")\n", - "\n", - "#testing labels\n", - "df_test_filtered, probs_test_filtered = filter_unlabeled_dataframe(\n", - " X=df_test['text'], \n", - " y=lm_preds_proba_test, \n", - " L=label_matrix_test\n", - ")\n", - "\n", - "print('Total points labelled in training data:',len(df_train_filtered))\n", - "print('Total points labelled in testing data:',len(df_test_filtered))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# References\n", - "\n", - "https://www.snorkel.org/use-cases/spouse-demo\n", - " \n", - "https://github.com/snorkel-team/snorkel-tutorials/blob/master/spouse/spouse_demo.ipynb\n", - " \n", - "https://www.snorkel.org/use-cases/01-spam-tutorial\n", - " \n", - "https://readthedocs.org/projects/snorkel/downloads/pdf/master/" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 5d3751e9355b1181b5dd98315a9e4ac0a2d915a6 Mon Sep 17 00:00:00 2001 From: kritim13 Date: Wed, 1 Jul 2020 22:42:35 +0530 Subject: [PATCH 3/3] 'snorkel' --- .../Snorkel RE example.ipynb | 1952 +++++++++++++++++ .../snorkel_preprocessing_example.py | 69 + 2 files changed, 2021 insertions(+) 
create mode 100644 immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb create mode 100644 immunology_kg/notebooks/snorkel_re_example/snorkel_preprocessing_example.py diff --git a/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb b/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb new file mode 100644 index 0000000..86b90af --- /dev/null +++ b/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb @@ -0,0 +1,1952 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from snorkel.preprocess.nlp import SpacyPreprocessor\n", + "from snorkel.labeling import PandasLFApplier,filter_unlabeled_dataframe,LFAnalysis ,labeling_function\n", + "from snorkel.labeling.model import MajorityClassVoter,MajorityLabelVoter,RandomVoter ,LabelModel\n", + "\n", + "\n", + "from snorkel.analysis import metric_score , get_label_buckets\n", + "\n", + "from snorkel.utils import probs_to_preds\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import pandas as pd\n", + "import re\n", + "import os\n", + "from collections import OrderedDict\n", + "\n", + "#importing self-defined helper modules\n", + "from snorkel_preprocessing_example import make_source_target_preprocessor,make_text_between_preprocessor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Load the data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0textsourcerelationtargetlinkpmc_iddoi_id
00While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...negativeCorrelation{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
11Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...negativeCorrelation{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
22Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...negativeCorrelation{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
33Based on these results, we performed virtual d...{\"4'-epidoxorubicin\": {'namespace': 'chebi', '...decreases{'3.4.22.69': {'namespace': 'eccode', 'name': ...{'annotations': {}, 'citation': {'authors': ['...32173287.0NaN
44Doctors can also use a clinically approved bil...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...{'annotations': {'mesh': {'D008168': True}}, '...32205856.0NaN
55Since Vitamin B3 is highly lung protective, it...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...{'annotations': {}, 'citation': {'authors': ['...32205856.0NaN
66Doctors can also use a clinically approved bil...{'4-methylumbelliferone': {'namespace': 'chebi...decreases{'inflammatory response': {'namespace': 'go', ...{'annotations': {'mesh': {'D008168': True}}, '...32205856.0NaN
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 text \\\n", + "0 0 While blocking TPC2 activity by tetrandrine, a... \n", + "1 1 Chemoinformatics searches yielded 15 approved ... \n", + "2 2 Thyroid stimulating hormone and free triiodoth... \n", + "3 3 Based on these results, we performed virtual d... \n", + "4 4 Doctors can also use a clinically approved bil... \n", + "5 5 Since Vitamin B3 is highly lung protective, it... \n", + "6 6 Doctors can also use a clinically approved bil... \n", + "\n", + " source relation \\\n", + "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... negativeCorrelation \n", + "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... negativeCorrelation \n", + "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... negativeCorrelation \n", + "3 {\"4'-epidoxorubicin\": {'namespace': 'chebi', '... decreases \n", + "4 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", + "5 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", + "6 {'4-methylumbelliferone': {'namespace': 'chebi... decreases \n", + "\n", + " target \\\n", + "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", + "1 {'hypertension': {'namespace': 'doid', 'name':... \n", + "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "3 {'3.4.22.69': {'namespace': 'eccode', 'name': ... \n", + "4 {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',... \n", + "5 {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',... \n", + "6 {'inflammatory response': {'namespace': 'go', ... \n", + "\n", + " link pmc_id \\\n", + "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", + "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", + "3 {'annotations': {}, 'citation': {'authors': ['... 32173287.0 \n", + "4 {'annotations': {'mesh': {'D008168': True}}, '... 32205856.0 \n", + "5 {'annotations': {}, 'citation': {'authors': ['... 32205856.0 \n", + "6 {'annotations': {'mesh': {'D008168': True}}, '... 
32205856.0 \n", + "\n", + " doi_id \n", + "0 NaN \n", + "1 https://doi.org/10.1101/2020.03.22.002386 \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 NaN " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#'https://raw.githubusercontent.com/covid19kg/covid19kg/master/supplement/terminology.csv'\n", + "url = 'https://raw.githubusercontent.com/CoronaWhy/task-vt/kaleidoescape_kg/immunology_kg/relations/covid19_frauenhofer_annotations.csv'\n", + "pybel_pd = pd.read_csv(url)\n", + "pybel_pd.head(7)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['negativeCorrelation', 'decreases', 'regulates', 'increases',\n", + " 'positiveCorrelation', 'association', 'isA', 'biomarkerFor',\n", + " 'prognosticBiomarkerFor', 'causesNoChange'], dtype=object)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#list all types of relations\n", + "relation_categories = pybel_pd['relation'].unique()\n", + "relation_categories" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Snorkel Example\n", + "\n", + "For the purpose of this example we'll only focus on rows with 'negativeCorrelation' and 'positiveCorrelation' as their relations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " \n", + "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:4102: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " errors=errors,\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsourcerelationtargetlinkpmc_iddoi_id
0While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...True{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
1Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...True{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
2Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
3The administration of methylprednisolone appea...{'6-methylprednisolone': {'namespace': 'chebi'...True{'Death': {'namespace': 'mesh', 'name': 'Death...{'annotations': {'doid': {'11394': True}}, 'ci...32167524.0NaN
4Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'Low-grade fever': {'namespace': 'hp', 'name'...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 While blocking TPC2 activity by tetrandrine, a... \n", + "1 Chemoinformatics searches yielded 15 approved ... \n", + "2 Thyroid stimulating hormone and free triiodoth... \n", + "3 The administration of methylprednisolone appea... \n", + "4 Adverse reactions of IFN-α mainly include low-... \n", + "\n", + " source relation \\\n", + "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... True \n", + "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... True \n", + "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... True \n", + "3 {'6-methylprednisolone': {'namespace': 'chebi'... True \n", + "4 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", + "\n", + " target \\\n", + "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", + "1 {'hypertension': {'namespace': 'doid', 'name':... \n", + "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "3 {'Death': {'namespace': 'mesh', 'name': 'Death... \n", + "4 {'Low-grade fever': {'namespace': 'hp', 'name'... \n", + "\n", + " link pmc_id \\\n", + "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", + "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", + "3 {'annotations': {'doid': {'11394': True}}, 'ci... 32167524.0 \n", + "4 {'annotations': {}, 'citation': {'authors': ['... 
32166483.0 \n", + "\n", + " doi_id \n", + "0 NaN \n", + "1 https://doi.org/10.1101/2020.03.22.002386 \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_data = pybel_pd[(pybel_pd['relation']=='negativeCorrelation') | (pybel_pd['relation']=='positiveCorrelation') ]\n", + "example_data['relation'] = example_data['relation']=='negativeCorrelation'\n", + "example_data.reset_index(inplace=True,drop=True)\n", + "example_data.drop('Unnamed: 0',inplace=True,axis=1)\n", + "example_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Split the data into training and testing \n", + "\n", + "Ideally there should be a training, validation and testing set. Also, here, I'm using a fixed testing period, but k-fold cross validation techniques are a more robust way of determining the accuracy of the generated labels." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "df_train,df_test,y_train,y_test = train_test_split(example_data[['text']],\n", + " example_data[['relation']],\n", + " test_size=0.20,\n", + " shuffle=False,random_state=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Reading sentences to understand syntactic differences between negative and positive correlation sentences\n", + "\n", + "The utility of snorkel is that it allows you to create multiple labelling functions which (try to) mimic the rules that a human annotator of data would apply while deciding how to label unlabelled data. For instance, a human annotator looking to identify negative correlation in sentences will follow the following rules:\n", + "\n", + "##### A. 
Positive rules: rules which tell you what a sentence with negative correlation looks like\n", + "\n", + "1) does the sentence contain reduction-related words like 'decreased','reduced','lowered'\n", + "\n", + "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'\n", + "\n", + "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)\n", + "\n", + "4) does the sentence contain the expression 'negative effect'\n", + "\n", + "5) does the sentence contain the expression 'move in opposite directions'\n", + "\n", + "##### B. Negative rules: rules which tell you what a sentence without negative correlation looks like\n", + "\n", + "1) does the sentence contain increase-related words like 'increased','improved'\n", + "\n", + "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related'\n", + "\n", + "3) does the syntax follow the structure 'increase in x is associated with an increase in y' (and vice versa)\n", + "\n", + "4) does the sentence contain the expression 'positive effect'\n", + "\n", + "5) does the sentence contain the expression 'move in the same direction'\n", + "\n", + "These rules can be coded using snorkel. Importantly, snorkel requires both positive and negative rules. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2.1 Examining Negative correlation sentences to understand their syntactic structure and then define labelling functions accordingly" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsourcerelationtargetlinkpmc_iddoi_id
0While blocking TPC2 activity by tetrandrine, a...{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...True{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
1Chemoinformatics searches yielded 15 approved ...{'(S)-verapamil': {'namespace': 'chebi', 'name...True{'hypertension': {'namespace': 'doid', 'name':...{'annotations': {}, 'citation': {'db': 'DOI', ...NaNhttps://doi.org/10.1101/2020.03.22.002386
2Thyroid stimulating hormone and free triiodoth...{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {'mesh': {'D044967': True}}, '...32217556.0NaN
3The administration of methylprednisolone appea...{'6-methylprednisolone': {'namespace': 'chebi'...True{'Death': {'namespace': 'mesh', 'name': 'Death...{'annotations': {'doid': {'11394': True}}, 'ci...32167524.0NaN
4In our opinion, during the COVID-19 pandemic, ...{'adrenergic antagonist': {'namespace': 'chebi...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32220710.0NaN
5Consistent with previous reports, 20mM NH4Cl a...{'ammonium chloride': {'namespace': 'chebi', '...True{'G protein, vesicular stomatitis virus': {'na...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
6If the latter percentage would be found to be ...{'angiotensin receptor antagonist': {'namespac...True{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32129518.0NaN
7Consistent with previous reports, 20mM NH4Cl a...{'bafilomycin A1': {'namespace': 'chebi', 'nam...True{'G protein, vesicular stomatitis virus': {'na...{'annotations': {}, 'citation': {'authors': ['...32221306.0NaN
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 While blocking TPC2 activity by tetrandrine, a... \n", + "1 Chemoinformatics searches yielded 15 approved ... \n", + "2 Thyroid stimulating hormone and free triiodoth... \n", + "3 The administration of methylprednisolone appea... \n", + "4 In our opinion, during the COVID-19 pandemic, ... \n", + "5 Consistent with previous reports, 20mM NH4Cl a... \n", + "6 If the latter percentage would be found to be ... \n", + "7 Consistent with previous reports, 20mM NH4Cl a... \n", + "\n", + " source relation \\\n", + "0 {'(+)-Tetrandrine': {'namespace': 'chebi', 'na... True \n", + "1 {'(S)-verapamil': {'namespace': 'chebi', 'name... True \n", + "2 {\"3,3',5'-triiodothyronine\": {'namespace': 'ch... True \n", + "3 {'6-methylprednisolone': {'namespace': 'chebi'... True \n", + "4 {'adrenergic antagonist': {'namespace': 'chebi... True \n", + "5 {'ammonium chloride': {'namespace': 'chebi', '... True \n", + "6 {'angiotensin receptor antagonist': {'namespac... True \n", + "7 {'bafilomycin A1': {'namespace': 'chebi', 'nam... True \n", + "\n", + " target \\\n", + "0 {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... \n", + "1 {'hypertension': {'namespace': 'doid', 'name':... \n", + "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "3 {'Death': {'namespace': 'mesh', 'name': 'Death... \n", + "4 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "5 {'G protein, vesicular stomatitis virus': {'na... \n", + "6 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "7 {'G protein, vesicular stomatitis virus': {'na... \n", + "\n", + " link pmc_id \\\n", + "0 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "1 {'annotations': {}, 'citation': {'db': 'DOI', ... NaN \n", + "2 {'annotations': {'mesh': {'D044967': True}}, '... 32217556.0 \n", + "3 {'annotations': {'doid': {'11394': True}}, 'ci... 32167524.0 \n", + "4 {'annotations': {}, 'citation': {'authors': ['... 
32220710.0 \n", + "5 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "6 {'annotations': {}, 'citation': {'authors': ['... 32129518.0 \n", + "7 {'annotations': {}, 'citation': {'authors': ['... 32221306.0 \n", + "\n", + " doi_id \n", + "0 NaN \n", + "1 https://doi.org/10.1101/2020.03.22.002386 \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 NaN \n", + "7 NaN " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "neg_correl_df = example_data[example_data['relation']==1]\n", + "neg_correl_df.reset_index(inplace=True,drop=True)\n", + "neg_correl_df.head(8)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry.'" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "neg_correl_df['text'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2.2 Examining positive correlation sentences to understand their syntactic structure and then define labelling functions accordingly" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsourcerelationtargetlinkpmc_iddoi_id
0Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'Low-grade fever': {'namespace': 'hp', 'name'...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
1Adverse reactions of IFN-α mainly include low-...{'Interferon alfa-2a': {'namespace': 'chebi', ...False{'influenza': {'namespace': 'doid', 'name': 'i...{'annotations': {}, 'citation': {'authors': ['...32166483.0NaN
2This may be accounted for by two complementary...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'COVID-19': {'namespace': 'doid', 'name': 'CO...{'annotations': {}, 'citation': {'authors': ['...32129518.0NaN
3ACE2 can also antagonize cardiac fibrosis and ...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'Ventricular Remodeling': {'namespace': 'mesh...{'annotations': {}, 'citation': {'authors': ['...32221983.0NaN
4ACE2 can also antagonize cardiac fibrosis and ...{'angiotensin II': {'namespace': 'chebi', 'nam...False{'Myocardial fibrosis': {'namespace': 'hp', 'n...{'annotations': {}, 'citation': {'authors': ['...32221983.0NaN
5The existence of significantly increased fibri...{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...False{'Hyperfibrinolysis': {'namespace': 'hp', 'nam...{'annotations': {}, 'citation': {'authors': ['...32216698.0NaN
6This opinion is supported by the presence of h...{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...False{'Hemorrhage': {'namespace': 'mesh', 'name': '...{'annotations': {}, 'citation': {'authors': ['...32216698.0NaN
7In the influenza virus model, it was reported ...{'chloroquine': {'namespace': 'chebi', 'name':...False{'dendritic cell antigen processing and presen...{'annotations': {'mesh': {'D007251': True}}, '...32171740.0NaN
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 Adverse reactions of IFN-α mainly include low-... \n", + "1 Adverse reactions of IFN-α mainly include low-... \n", + "2 This may be accounted for by two complementary... \n", + "3 ACE2 can also antagonize cardiac fibrosis and ... \n", + "4 ACE2 can also antagonize cardiac fibrosis and ... \n", + "5 The existence of significantly increased fibri... \n", + "6 This opinion is supported by the presence of h... \n", + "7 In the influenza virus model, it was reported ... \n", + "\n", + " source relation \\\n", + "0 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", + "1 {'Interferon alfa-2a': {'namespace': 'chebi', ... False \n", + "2 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", + "3 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", + "4 {'angiotensin II': {'namespace': 'chebi', 'nam... False \n", + "5 {'Fibrin': {'namespace': 'chebi', 'name': 'Fib... False \n", + "6 {'Fibrin': {'namespace': 'chebi', 'name': 'Fib... False \n", + "7 {'chloroquine': {'namespace': 'chebi', 'name':... False \n", + "\n", + " target \\\n", + "0 {'Low-grade fever': {'namespace': 'hp', 'name'... \n", + "1 {'influenza': {'namespace': 'doid', 'name': 'i... \n", + "2 {'COVID-19': {'namespace': 'doid', 'name': 'CO... \n", + "3 {'Ventricular Remodeling': {'namespace': 'mesh... \n", + "4 {'Myocardial fibrosis': {'namespace': 'hp', 'n... \n", + "5 {'Hyperfibrinolysis': {'namespace': 'hp', 'nam... \n", + "6 {'Hemorrhage': {'namespace': 'mesh', 'name': '... \n", + "7 {'dendritic cell antigen processing and presen... \n", + "\n", + " link pmc_id doi_id \n", + "0 {'annotations': {}, 'citation': {'authors': ['... 32166483.0 NaN \n", + "1 {'annotations': {}, 'citation': {'authors': ['... 32166483.0 NaN \n", + "2 {'annotations': {}, 'citation': {'authors': ['... 32129518.0 NaN \n", + "3 {'annotations': {}, 'citation': {'authors': ['... 32221983.0 NaN \n", + "4 {'annotations': {}, 'citation': {'authors': ['... 
32221983.0 NaN \n", + "5 {'annotations': {}, 'citation': {'authors': ['... 32216698.0 NaN \n", + "6 {'annotations': {}, 'citation': {'authors': ['... 32216698.0 NaN \n", + "7 {'annotations': {'mesh': {'D007251': True}}, '... 32171740.0 NaN " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "positive_relation_df = example_data[example_data['relation']==0]\n", + "positive_relation_df.reset_index(inplace=True,drop=True)\n", + "positive_relation_df.head(8)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Adverse reactions of IFN-α mainly include low-grade fever and flu-like symptoms (both in children with intramuscularly injection) [11].'" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "positive_relation_df['text'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Source-Target dictionary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A simple but clean rule for identifying negative correlation sentences would be to check whether negative tokens occur in the words between the source and the target. So, a source-target dictionary is created for some of the examples (in the final pipeline the source-target dictionary will be obtained from the spacy pipeline)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('tetrandrine', 'TPC2'),\n", + " ('triiodothyronine', 'recovered'),\n", + " ('methylprednisolone', 'death'),\n", + " ('IFN-α', 'fever'),\n", + " ('angiotensin', 'vasodilator'),\n", + " ('ACE2', 'Ang'),\n", + " ('fibrin', 'COVID-19'),\n", + " ('hemorrhage', 'fibrinolysis'),\n", + " ('chloroquine', 'dendritic')]" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_sources = ['tetrandrine','triiodothyronine','methylprednisolone','IFN-α','angiotensin','ACE2','fibrin','hemorrhage','chloroquine']\n", + "\n", + "example_targets = ['TPC2','recovered','death','fever','vasodilator','Ang','COVID-19','fibrinolysis','dendritic'] #low-grade fever\n", + "\n", + "example_source_target_dict = list(OrderedDict.fromkeys(zip(example_sources,example_targets)))\n", + "example_source_target_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 Labeling functions for RE" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "spacy = SpacyPreprocessor(text_field=\"text\", doc_field=\"doc\", memoize=True)\n", + "\n", + "ABSTAIN = -1\n", + "NOT_FOUND = 0\n", + "FOUND = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.4.1 Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry. 
\n", + "\n", + "Source-target pair: ('tetrandrine', 'TPC2')\n" + ] + } + ], + "source": [ + "get_source_target = make_source_target_preprocessor(spacy, example_sources, example_targets)\n", + "\n", + "candidate = example_data.loc[0]\n", + "candidate_with_function_applied = get_source_target(candidate) \n", + "\n", + "print(\"Sentence: \", candidate[\"text\"],'\\n',)\n", + "print(\"Source-target pair: \", candidate_with_function_applied.source_target)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: Thyroid stimulating hormone and free triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in recovered patients (1.4 mIU/mL and 4.3 pmol/L). \n", + "\n", + "Text Between: triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in \n", + "\n", + "Text to the left: Thyroid stimulating hormone and free \n", + "\n" + ] + } + ], + "source": [ + "get_text_between = make_text_between_preprocessor(spacy, example_sources, example_targets)\n", + "\n", + "############ function example ###############################\n", + "candidate = example_data.loc[2]\n", + "\n", + "candidate_with_function_applied = get_text_between(candidate)\n", + "\n", + "print(\"Sentence: \", candidate[\"text\"],'\\n')\n", + "print(\"Text Between: \", candidate_with_function_applied.text_between,'\\n')\n", + "print(\"Text to the left: \", candidate_with_function_applied.text_to_source_left,'\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.4.2 Defining the labelling functions (lf)\n", + "\n", + "For the final labelling model to work, at least 3 rules are needed.\n", + "\n", + "##### A. 
Positive rules: rules which tell you what a sentence with negative correlation looks like\n", + " \n", + "1) does the sentence contain reduction-related words like 'decreased', 'reduced', 'lower'" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "reduction_tokens = {'decreased',\n", + " 'lower',\n", + " 'reduced',\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "@labeling_function(pre=[spacy],resources=dict(reduction_tokens=reduction_tokens))\n", + "def contains_reduction_tokens(x,reduction_tokens):\n", + " \n", + " tokens = [str(token) for token in x.doc]\n", + " return FOUND if len(reduction_tokens.intersection(set(tokens))) > 0 else ABSTAIN\n", + "\n", + "#positive rule - version 2\n", + "@labeling_function(pre=[spacy,get_text_between],resources=dict(reduction_tokens=reduction_tokens))\n", + "def contains_reduction_tokens_text_between(x,reduction_tokens):\n", + " relation_text = x.text_between\n", + " relation_text_tokens = [str(token) for token in relation_text]\n", + " return FOUND if len(reduction_tokens.intersection(set(relation_text_tokens))) > 0 else ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "negative_correlation_regex_1 = 'negative correlation'\n", + "negative_correlation_regex_2 = 'negatively correlated'\n", + "negative_correlation_regex_3 = 'negatively related'\n", + "negative_correlation_regex_4 = 'inversely related'\n", + "negative_correlation_regex_5 = 'inverse relation'\n", + "negative_correlation_regex_6 = 'negative effect'\n", + "negative_correlation_regex_7 = 'move in opposite 
directions'\n", + "\n", + "@labeling_function()\n", + "def contains_negative_corrrelation_regex(x):\n", + " if re.search(negative_correlation_regex_1, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_2, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_3, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_4, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_5, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_6, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(negative_correlation_regex_7, x.text, flags=re.I):\n", + " return FOUND\n", + "\n", + " else: \n", + " return ABSTAIN\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "@labeling_function(pre=[spacy,get_text_between])\n", + "def contains_increase_decrease_pattern(x):\n", + " if ('increase' in [str(token) for token in x.text_to_source_left]) & ('decrease' in [str(token) for token in x.text_between]):\n", + " return FOUND\n", + " elif ('decrease' in [str(token) for token in x.text_to_source_left]) & ('increase' in [str(token) for token in x.text_between]):\n", + " return FOUND\n", + " else:\n", + " return ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### B. 
Negative rules: rules which tell you what a sentence without negative correlation looks like\n", + "\n", + "1) does the sentence contain increase-related words like 'increased', 'higher'" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "increase_tokens = {'increased',\n", + " 'higher',\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "@labeling_function(pre=[spacy],resources=dict(increase_tokens=increase_tokens))\n", + "def contains_increase_tokens(x,increase_tokens):\n", + " tokens = [str(token) for token in x.doc]\n", + " return NOT_FOUND if len(increase_tokens.intersection(set(tokens))) > 0 else ABSTAIN\n", + "\n", + "\n", + "@labeling_function(pre=[spacy,get_text_between],resources=dict(increase_tokens=increase_tokens))\n", + "def contains_increase_tokens_text_between(x, increase_tokens):\n", + " relation_text = x.text_between\n", + " relation_text_tokens = [str(token) for token in relation_text]\n", + " return NOT_FOUND if len(increase_tokens.intersection(set(relation_text_tokens))) > 0 else ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related' or 'positive effect' or 'move in the same direction'" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "#regex\n", + "positive_correlation_regex_1 = 'positive correlation'\n", + "positive_correlation_regex_2 = 'positively correlated'\n", + "positive_correlation_regex_3 = 'positively related'\n", + "positive_correlation_regex_4 = 'positive effect'\n", + "positive_correlation_regex_5 = 'move in the same direction'\n", + "\n", + "@labeling_function()\n", + "def contains_positive_corrrelation_regex(x):\n", + " if re.search(positive_correlation_regex_1, x.text, 
flags=re.I):\n", + " return FOUND\n", + " elif re.search(positive_correlation_regex_2, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(positive_correlation_regex_3, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(positive_correlation_regex_4, x.text, flags=re.I):\n", + " return FOUND\n", + " elif re.search(positive_correlation_regex_5, x.text, flags=re.I):\n", + " return FOUND \n", + " else:\n", + " return ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3) does the syntax follow the structure 'increase in x is associated with a increase in y' (and vice versa)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "@labeling_function(pre=[spacy,get_text_between])\n", + "def contains_increase_increase_pattern(x):\n", + " if ('increase' in [str(token) for token in x.text_to_source_left]) & ('increase' in [str(token) for token in x.text_between]):\n", + " return FOUND\n", + " elif ('decrease' in [str(token) for token in x.text_to_source_left]) & ('decrease' in [str(token) for token in x.text_between]):\n", + " return FOUND\n", + " else:\n", + " return ABSTAIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.5 Creating all the labels for the different rules" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. 
Accessing it from the top-level namespace will also be removed in the next version\n", + " from pandas import Panel\n", + "100%|█████████████████████████████████████████████████████████████████████████████| 2227/2227 [00:10<00:00, 213.19it/s]\n" + ] + } + ], + "source": [ + "label_functions_list = [contains_reduction_tokens,\n", + " contains_reduction_tokens_text_between,\n", + " contains_negative_corrrelation_regex,\n", + " contains_increase_decrease_pattern,\n", + " contains_increase_tokens,\n", + " contains_increase_tokens_text_between,\n", + " contains_positive_corrrelation_regex,\n", + " contains_increase_increase_pattern\n", + " ]\n", + "\n", + "applier = PandasLFApplier(label_functions_list)\n", + "\n", + "label_matrix_train = applier.apply(df_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.6 Examining the quality of the labels" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jPolarityCoverageOverlapsConflicts
contains_reduction_tokens0[1]0.0570270.0193080.016165
contains_reduction_tokens_text_between1[1]0.0040410.0040410.000898
contains_negative_corrrelation_regex2[1]0.0085320.0000000.000000
contains_increase_decrease_pattern3[]0.0000000.0000000.000000
contains_increase_tokens4[0]0.1302200.0161650.016165
contains_increase_tokens_text_between5[]0.0000000.0000000.000000
contains_positive_corrrelation_regex6[1]0.0022450.0000000.000000
contains_increase_increase_pattern7[]0.0000000.0000000.000000
\n", + "
" + ], + "text/plain": [ + " j Polarity Coverage Overlaps \\\n", + "contains_reduction_tokens 0 [1] 0.057027 0.019308 \n", + "contains_reduction_tokens_text_between 1 [1] 0.004041 0.004041 \n", + "contains_negative_corrrelation_regex 2 [1] 0.008532 0.000000 \n", + "contains_increase_decrease_pattern 3 [] 0.000000 0.000000 \n", + "contains_increase_tokens 4 [0] 0.130220 0.016165 \n", + "contains_increase_tokens_text_between 5 [] 0.000000 0.000000 \n", + "contains_positive_corrrelation_regex 6 [1] 0.002245 0.000000 \n", + "contains_increase_increase_pattern 7 [] 0.000000 0.000000 \n", + "\n", + " Conflicts \n", + "contains_reduction_tokens 0.016165 \n", + "contains_reduction_tokens_text_between 0.000898 \n", + "contains_negative_corrrelation_regex 0.000000 \n", + "contains_increase_decrease_pattern 0.000000 \n", + "contains_increase_tokens 0.016165 \n", + "contains_increase_tokens_text_between 0.000000 \n", + "contains_positive_corrrelation_regex 0.000000 \n", + "contains_increase_increase_pattern 0.000000 " + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#in the absence of a benchmark to compare against\n", + "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jPolarityCoverageOverlapsConflictsCorrectIncorrectEmp. Acc.
contains_reduction_tokens0[1]0.0570270.0193080.01616569580.543307
contains_reduction_tokens_text_between1[1]0.0040410.0040410.000898810.888889
contains_negative_corrrelation_regex2[1]0.0085320.0000000.0000001810.947368
contains_increase_decrease_pattern3[]0.0000000.0000000.000000000.000000
contains_increase_tokens4[0]0.1302200.0161650.016165263270.906897
contains_increase_tokens_text_between5[]0.0000000.0000000.000000000.000000
contains_positive_corrrelation_regex6[1]0.0022450.0000000.000000050.000000
contains_increase_increase_pattern7[]0.0000000.0000000.000000000.000000
\n", + "
" + ], + "text/plain": [ + " j Polarity Coverage Overlaps \\\n", + "contains_reduction_tokens 0 [1] 0.057027 0.019308 \n", + "contains_reduction_tokens_text_between 1 [1] 0.004041 0.004041 \n", + "contains_negative_corrrelation_regex 2 [1] 0.008532 0.000000 \n", + "contains_increase_decrease_pattern 3 [] 0.000000 0.000000 \n", + "contains_increase_tokens 4 [0] 0.130220 0.016165 \n", + "contains_increase_tokens_text_between 5 [] 0.000000 0.000000 \n", + "contains_positive_corrrelation_regex 6 [1] 0.002245 0.000000 \n", + "contains_increase_increase_pattern 7 [] 0.000000 0.000000 \n", + "\n", + " Conflicts Correct Incorrect \\\n", + "contains_reduction_tokens 0.016165 69 58 \n", + "contains_reduction_tokens_text_between 0.000898 8 1 \n", + "contains_negative_corrrelation_regex 0.000000 18 1 \n", + "contains_increase_decrease_pattern 0.000000 0 0 \n", + "contains_increase_tokens 0.016165 263 27 \n", + "contains_increase_tokens_text_between 0.000000 0 0 \n", + "contains_positive_corrrelation_regex 0.000000 0 5 \n", + "contains_increase_increase_pattern 0.000000 0 0 \n", + "\n", + " Emp. Acc. \n", + "contains_reduction_tokens 0.543307 \n", + "contains_reduction_tokens_text_between 0.888889 \n", + "contains_negative_corrrelation_regex 0.947368 \n", + "contains_increase_decrease_pattern 0.000000 \n", + "contains_increase_tokens 0.906897 \n", + "contains_increase_tokens_text_between 0.000000 \n", + "contains_positive_corrrelation_regex 0.000000 \n", + "contains_increase_increase_pattern 0.000000 " + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#examining the quality of the labels in the presence of a benchmark to compare against\n", + "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary(y_train.values)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0While blocking TPC2 activity by tetrandrine, a...
2Thyroid stimulating hormone and free triiodoth...
3The administration of methylprednisolone appea...
7Consistent with previous reports, 20mM NH4Cl a...
12Consistent with previous reports, 20mM NH4Cl a...
......
1644Actual bicarbonate and total carbon dioxide co...
1655Albumin concentrations were significantly lowe...
1657Moreover, the frequencies of regulatory T cell...
1658The reduced expressions of interferon-γ (IFN-γ...
1668Spleen atrophy was observed in all reported ca...
\n", + "

127 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " text\n", + "0 While blocking TPC2 activity by tetrandrine, a...\n", + "2 Thyroid stimulating hormone and free triiodoth...\n", + "3 The administration of methylprednisolone appea...\n", + "7 Consistent with previous reports, 20mM NH4Cl a...\n", + "12 Consistent with previous reports, 20mM NH4Cl a...\n", + "... ...\n", + "1644 Actual bicarbonate and total carbon dioxide co...\n", + "1655 Albumin concentrations were significantly lowe...\n", + "1657 Moreover, the frequencies of regulatory T cell...\n", + "1658 The reduced expressions of interferon-γ (IFN-γ...\n", + "1668 Spleen atrophy was observed in all reported ca...\n", + "\n", + "[127 rows x 1 columns]" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#examine which sentences were picked up as showing negative correlation by each label function\n", + "df_train.iloc[label_matrix_train[:, 0] == FOUND]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.7 Predict the final label\n", + "\n", + "Different models can be used to create the final model that aggrateges the different label functions to perdict the final lebel." + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. 
Accessing it from the top-level namespace will also be removed in the next version\n", + " from pandas import Panel\n", + "100%|███████████████████████████████████████████████████████████████████████████████| 557/557 [00:01<00:00, 282.64it/s]\n" + ] + } + ], + "source": [ + "#testing data\n", + "label_matrix_test = applier.apply(df_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "#Model 1 : majority model (mm)\n", + "majority_model = MajorityLabelVoter()\n", + "\n", + "#training data\n", + "mm_preds_class_train = majority_model.predict(L=label_matrix_train)\n", + "mm_preds_proba_train = majority_model.predict_proba(L=label_matrix_train)\n", + "\n", + "#testing data\n", + "mm_preds_class_test = majority_model.predict(L=label_matrix_test)\n", + "mm_preds_proba_test = majority_model.predict_proba(L=label_matrix_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, -1, 1, ..., -1, -1, -1])" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mm_preds_class_train # only the 1s and 0s are labels. T-1s are abstains i.e. unlabeled data points" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0. , 1. ],\n", + " [0.5, 0.5],\n", + " [0. , 1. 
],\n", + " ...,\n", + " [0.5, 0.5],\n", + " [0.5, 0.5],\n", + " [0.5, 0.5]])" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mm_preds_proba_train" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "# Model 2:label model (lm)\n", + "\n", + "#call the model\n", + "label_model = LabelModel(cardinality=2, verbose=True)\n", + "\n", + "#fit the model\n", + "num_epochs = 1000\n", + "log_frequency = 100\n", + "random_seed = 1\n", + "label_model.fit(L_train=label_matrix_train, n_epochs=num_epochs, log_freq=log_frequency, seed=random_seed)\n", + "\n", + "#generate lables for training data\n", + "lm_preds_proba_train = label_model.predict_proba(label_matrix_train)\n", + "lm_preds_class_train = probs_to_preds(lm_preds_proba_train)\n", + "\n", + "#generate labels for testing data\n", + "lm_preds_proba_test = label_model.predict_proba(label_matrix_test)\n", + "lm_preds_class_test = probs_to_preds(lm_preds_proba_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "# Model 3 : Random Voter (rv)\n", + "\n", + "random_voter = RandomVoter()\n", + "\n", + "#training data\n", + "rv_preds_class_train = random_voter.predict(L=label_matrix_train)\n", + "rv_preds_proba_train = random_voter.predict_proba(L=label_matrix_train)\n", + "\n", + "#testing data\n", + "rv_preds_class_test = random_voter.predict(L=label_matrix_test)\n", + "rv_preds_proba_test = random_voter.predict_proba(L=label_matrix_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.8 Comparing different models" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Majority Model \n", + " Accuracy: \n", + " train-> 0.5635383924562192 \n", + " test-> 0.4703770197486535 \n", + " AUC: \n", + " train-> 
0.6010212548732079 \n", + " test-> 0.5227059436913452 \n", + "\n", + "Label Model \n", + " Accuracy: \n", + " train-> 0.5527615626403233 \n", + " test-> 0.49012567324955114 \n", + " AUC: \n", + " train-> 0.524345344386498 \n", + " test-> 0.4437434827945777 \n", + "\n", + "Random Voter Model \n", + " Accuracy: \n", + " train-> 0.5024696901661428 \n", + " test-> 0.5008976660682226 \n", + " AUC: \n", + " train-> 0.5126309212678816 \n", + " test-> 0.5023114355231144 \n", + "\n" + ] + } + ], + "source": [ + "all_models = {'Majority Model':majority_model,\n", + " 'Label Model':label_model,\n", + " 'Random Voter Model':random_voter}\n", + "\n", + "for model_name,model in all_models.items():\n", + " \n", + " #accuracy\n", + " train_acc = model.score(L=label_matrix_train, Y=y_train, tie_break_policy=\"random\")[\"accuracy\"]\n", + " test_acc = model.score(L=label_matrix_test, Y=y_test, tie_break_policy=\"random\")[\"accuracy\"]\n", + " \n", + " #auc\n", + " train_auc = metric_score(y_train, probs=model.predict_proba(L=label_matrix_train), metric='roc_auc')\n", + " test_auc = metric_score(y_test, probs=model.predict_proba(L=label_matrix_test), metric='roc_auc')\n", + " \n", + " print(f'{model_name}','\\n',\n", + " 'Accuracy:','\\n','train->',train_acc,'\\n','test->',test_acc,'\\n',\n", + " 'AUC:','\\n','train->',train_auc,'\\n','test->',test_auc,'\\n')\n", + " \n", + " \n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The majority model has the highest test AUC (0.523, versus 0.502 for the random voter and 0.444 for the label model), so that's the best model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Filter out unlabeled points" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total points labelled in training data: 405\n", + "Total points labelled in testing data: 91\n" + ] + } + ], + "source": [ + "#training labels\n", + "df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(\n", + " X=df_train['text'], \n", + " y=mm_preds_proba_train, \n", + " L=label_matrix_train\n", + ")\n", + "\n", + "#testing labels\n", + "df_test_filtered, probs_test_filtered = filter_unlabeled_dataframe(\n", + " X=df_test['text'], \n", + " y=mm_preds_proba_test, \n", + " L=label_matrix_test\n", + ")\n", + "\n", + "print('Total points labelled in training data:',len(df_train_filtered))\n", + "print('Total points labelled in testing data:',len(df_test_filtered))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# References\n", + "\n", + "https://www.snorkel.org/use-cases/spouse-demo\n", + " \n", + "https://github.com/snorkel-team/snorkel-tutorials/blob/master/spouse/spouse_demo.ipynb\n", + " \n", + "https://www.snorkel.org/use-cases/01-spam-tutorial\n", + " \n", + "https://readthedocs.org/projects/snorkel/downloads/pdf/master/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/immunology_kg/notebooks/snorkel_re_example/snorkel_preprocessing_example.py b/immunology_kg/notebooks/snorkel_re_example/snorkel_preprocessing_example.py new file mode 100644 index 0000000..0183140 --- /dev/null +++ 
b/immunology_kg/notebooks/snorkel_re_example/snorkel_preprocessing_example.py @@ -0,0 +1,69 @@ +import snorkel + +from snorkel.preprocess import preprocessor +from snorkel.preprocess.nlp import SpacyPreprocessor +from snorkel.types import DataPoint + +spacy = SpacyPreprocessor(text_field="text", doc_field="doc", memoize=True) + +def make_source_target_preprocessor(spacy, sources, targets): + @preprocessor(pre=[spacy]) + def get_source_target(cand: DataPoint) -> DataPoint: + """Returnsthe source and target mentioned in the sentence.""" + person_names = [] + + source = [token.text for token in cand.doc if token.text in sources] + target = [token.text for token in cand.doc if token.text in targets] + + try: + cand.source_target = (source[0], target[0]) + except: + cand.source_target = (np.nan, np.nan) + return cand + return get_source_target + +def make_text_between_preprocessor(spacy, sources, targets): + @preprocessor(pre=[spacy]) + def get_text_between(cand: DataPoint) -> DataPoint: + """ + Returns the text between a source-target pair and the text to the left of the source + """ + + source_idx = [token.i for token in cand.doc if token.text in sources] + target_idx = [token.i for token in cand.doc if token.text in targets] + + try: + + if (len(target_idx)==1) & (len(source_idx)==1) & (source_idx[0]1) & (len(source_idx)==1): + for target_index in target_idx: + if source_idx[0]1) & (len(target_idx)==1): + for source_index in source_idx: + if source_index1) & (len(target_idx)>1): + for source_index in source_idx: + for target_index in target_idx: + if source_index