From 024dd8de8c1dc0d05cbe3407b49f8b804630421c Mon Sep 17 00:00:00 2001
From: Kriti Mahajan <kriti.mahajan.13@gmail.com>
Date: Mon, 29 Jun 2020 21:55:40 +0530
Subject: [PATCH 1/3] Add files via upload

---
 .../notebooks/Snorkel RE example.ipynb        | 2187 +++++++++++++++++
 1 file changed, 2187 insertions(+)
 create mode 100644 immunology_kg/notebooks/Snorkel RE example.ipynb
diff --git a/immunology_kg/notebooks/Snorkel RE example.ipynb b/immunology_kg/notebooks/Snorkel RE example.ipynb
new file mode 100644
index 0000000..d44c4a9
--- /dev/null
+++ b/immunology_kg/notebooks/Snorkel RE example.ipynb	
@@ -0,0 +1,2187 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import snorkel\n",
+    "\n",
+    "from snorkel.preprocess import preprocessor\n",
+    "from snorkel.preprocess.nlp import SpacyPreprocessor\n",
+    "from snorkel.types import DataPoint\n",
+    "\n",
+    "from snorkel.labeling.lf.nlp import nlp_labeling_function\n",
+    "from snorkel.labeling import PandasLFApplier,filter_unlabeled_dataframe,LFAnalysis ,labeling_function\n",
+    "from snorkel.labeling.model import MajorityClassVoter,MajorityLabelVoter,RandomVoter ,LabelModel\n",
+    "\n",
+    "from snorkel.analysis import metric_score , get_label_buckets\n",
+    "\n",
+    "from snorkel.utils import probs_to_preds\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import os\n",
+    "from collections import OrderedDict"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 1. Load the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>relation</th>\n",
+       "      <th>target</th>\n",
+       "      <th>link</th>\n",
+       "      <th>pmc_id</th>\n",
+       "      <th>doi_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
+       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
+       "      <td>negativeCorrelation</td>\n",
+       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
+       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
+       "      <td>negativeCorrelation</td>\n",
+       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
+       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
+       "      <td>negativeCorrelation</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
+       "      <td>32217556.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Based on these results, we performed virtual d...</td>\n",
+       "      <td>{\"4'-epidoxorubicin\": {'namespace': 'chebi', '...</td>\n",
+       "      <td>decreases</td>\n",
+       "      <td>{'3.4.22.69': {'namespace': 'eccode', 'name': ...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32173287.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>4</td>\n",
+       "      <td>Doctors can also use a clinically approved bil...</td>\n",
+       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
+       "      <td>decreases</td>\n",
+       "      <td>{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D008168': True}}, '...</td>\n",
+       "      <td>32205856.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>5</td>\n",
+       "      <td>Since Vitamin B3 is highly lung protective, it...</td>\n",
+       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
+       "      <td>decreases</td>\n",
+       "      <td>{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32205856.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "      <td>Doctors can also use a clinically approved bil...</td>\n",
+       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
+       "      <td>decreases</td>\n",
+       "      <td>{'inflammatory response': {'namespace': 'go', ...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D008168': True}}, '...</td>\n",
+       "      <td>32205856.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Unnamed: 0                                               text  \\\n",
+       "0           0  While blocking TPC2 activity by tetrandrine, a...   \n",
+       "1           1  Chemoinformatics searches yielded 15 approved ...   \n",
+       "2           2  Thyroid stimulating hormone and free triiodoth...   \n",
+       "3           3  Based on these results, we performed virtual d...   \n",
+       "4           4  Doctors can also use a clinically approved bil...   \n",
+       "5           5  Since Vitamin B3 is highly lung protective, it...   \n",
+       "6           6  Doctors can also use a clinically approved bil...   \n",
+       "\n",
+       "                                              source             relation  \\\n",
+       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...  negativeCorrelation   \n",
+       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...  negativeCorrelation   \n",
+       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...  negativeCorrelation   \n",
+       "3  {\"4'-epidoxorubicin\": {'namespace': 'chebi', '...            decreases   \n",
+       "4  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
+       "5  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
+       "6  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
+       "\n",
+       "                                              target  \\\n",
+       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
+       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
+       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "3  {'3.4.22.69': {'namespace': 'eccode', 'name': ...   \n",
+       "4  {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...   \n",
+       "5  {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...   \n",
+       "6  {'inflammatory response': {'namespace': 'go', ...   \n",
+       "\n",
+       "                                                link      pmc_id  \\\n",
+       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
+       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
+       "3  {'annotations': {}, 'citation': {'authors': ['...  32173287.0   \n",
+       "4  {'annotations': {'mesh': {'D008168': True}}, '...  32205856.0   \n",
+       "5  {'annotations': {}, 'citation': {'authors': ['...  32205856.0   \n",
+       "6  {'annotations': {'mesh': {'D008168': True}}, '...  32205856.0   \n",
+       "\n",
+       "                                      doi_id  \n",
+       "0                                        NaN  \n",
+       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
+       "2                                        NaN  \n",
+       "3                                        NaN  \n",
+       "4                                        NaN  \n",
+       "5                                        NaN  \n",
+       "6                                        NaN  "
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#'https://raw.githubusercontent.com/covid19kg/covid19kg/master/supplement/terminology.csv'\n",
+    "url = 'https://raw.githubusercontent.com/CoronaWhy/task-vt/kaleidoescape_kg/immunology_kg/relations/covid19_frauenhofer_annotations.csv'\n",
+    "pybel_pd = pd.read_csv(url)\n",
+    "pybel_pd.head(7)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['negativeCorrelation', 'decreases', 'regulates', 'increases',\n",
+       "       'positiveCorrelation', 'association', 'isA', 'biomarkerFor',\n",
+       "       'prognosticBiomarkerFor', 'causesNoChange'], dtype=object)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Distinct relation labels found in the annotations, in order of first appearance\n",
+    "relation_categories = pd.unique(pybel_pd['relation'])\n",
+    "relation_categories"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2. Snorkel Example\n",
+    "\n",
+    "For the purpose of this example we'll only focus on rows with 'negativeCorrelation' and 'positiveCorrelation' as their relations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  \n",
+      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:4102: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  errors=errors,\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>relation</th>\n",
+       "      <th>target</th>\n",
+       "      <th>link</th>\n",
+       "      <th>pmc_id</th>\n",
+       "      <th>doi_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
+       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
+       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
+       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
+       "      <td>32217556.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>The administration of methylprednisolone appea...</td>\n",
+       "      <td>{'6-methylprednisolone': {'namespace': 'chebi'...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'Death': {'namespace': 'mesh', 'name': 'Death...</td>\n",
+       "      <td>{'annotations': {'doid': {'11394': True}}, 'ci...</td>\n",
+       "      <td>32167524.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
+       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Low-grade fever': {'namespace': 'hp', 'name'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32166483.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text  \\\n",
+       "0  While blocking TPC2 activity by tetrandrine, a...   \n",
+       "1  Chemoinformatics searches yielded 15 approved ...   \n",
+       "2  Thyroid stimulating hormone and free triiodoth...   \n",
+       "3  The administration of methylprednisolone appea...   \n",
+       "4  Adverse reactions of IFN-α mainly include low-...   \n",
+       "\n",
+       "                                              source  relation  \\\n",
+       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...      True   \n",
+       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...      True   \n",
+       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...      True   \n",
+       "3  {'6-methylprednisolone': {'namespace': 'chebi'...      True   \n",
+       "4  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
+       "\n",
+       "                                              target  \\\n",
+       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
+       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
+       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "3  {'Death': {'namespace': 'mesh', 'name': 'Death...   \n",
+       "4  {'Low-grade fever': {'namespace': 'hp', 'name'...   \n",
+       "\n",
+       "                                                link      pmc_id  \\\n",
+       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
+       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
+       "3  {'annotations': {'doid': {'11394': True}}, 'ci...  32167524.0   \n",
+       "4  {'annotations': {}, 'citation': {'authors': ['...  32166483.0   \n",
+       "\n",
+       "                                      doi_id  \n",
+       "0                                        NaN  \n",
+       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
+       "2                                        NaN  \n",
+       "3                                        NaN  \n",
+       "4                                        NaN  "
+      ]
+     },
+     "execution_count": 131,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Build the subset without writing into a slice of pybel_pd (the old in-place edits raised SettingWithCopyWarning). relation becomes a boolean label: True = negativeCorrelation, False = positiveCorrelation.\n",
+    "is_correlation = pybel_pd['relation'].isin(['negativeCorrelation', 'positiveCorrelation'])\n",
+    "example_data = pybel_pd.loc[is_correlation].assign(relation=lambda df: df['relation'] == 'negativeCorrelation')\n",
+    "example_data = example_data.drop(columns='Unnamed: 0').reset_index(drop=True)  # drop the CSV's saved index column and renumber rows from 0\n",
+    "example_data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.1 Split the data into training and testing \n",
+    "\n",
+    "Ideally we should have training, validation and testing sets. Also, here I'm using a single fixed test split, but k-fold cross-validation is a more robust way of determining the accuracy of the generated labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 132,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train,df_test,y_train,y_test = train_test_split(example_data[['text']],example_data[['relation']],test_size=0.20,shuffle=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.2 Reading sentences to understand syntactic differences between negative and positive correlation sentences\n",
+    "\n",
+    "The utility of Snorkel is that it allows you to create multiple labelling functions which (try to) mimic the rules that a human annotator would apply while deciding how to label unlabelled data. For instance, a human annotator looking to identify negative correlation in sentences would apply the following rules:\n",
+    "\n",
+    "##### A. Positive rules: rules which tell you what a sentence with negative correlation looks like\n",
+    "\n",
+    "1) does the sentence contain reduction-related words like 'decreased', 'reduced', 'lowered'\n",
+    "\n",
+    "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'\n",
+    "\n",
+    "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)\n",
+    "\n",
+    "4) does the sentence contain the expression 'negative effect'\n",
+    "\n",
+    "5) does the sentence contain the expression 'move in opposite directions'\n",
+    "\n",
+    "##### B. Negative rules: rules which tell you what a sentence without negative correlation looks like\n",
+    "\n",
+    "1) does the sentence contain increase-related words like 'increased', 'improved'\n",
+    "\n",
+    "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related'\n",
+    "\n",
+    "3) does the syntax follow the structure 'increase in x is associated with an increase in y' (and vice versa)\n",
+    "\n",
+    "4) does the sentence contain the expression 'positive effect'\n",
+    "\n",
+    "5) does the sentence contain the expression 'move in the same direction'\n",
+    "\n",
+    "These rules can be coded using snorkel. Importantly it requires both positive and negative rules. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.2.1 Examining Negative correlation sentences to understand their syntactic structure and then define labelling functions accordingly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 133,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>relation</th>\n",
+       "      <th>target</th>\n",
+       "      <th>link</th>\n",
+       "      <th>pmc_id</th>\n",
+       "      <th>doi_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
+       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
+       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
+       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
+       "      <td>32217556.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>The administration of methylprednisolone appea...</td>\n",
+       "      <td>{'6-methylprednisolone': {'namespace': 'chebi'...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'Death': {'namespace': 'mesh', 'name': 'Death...</td>\n",
+       "      <td>{'annotations': {'doid': {'11394': True}}, 'ci...</td>\n",
+       "      <td>32167524.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>In our opinion, during the COVID-19 pandemic, ...</td>\n",
+       "      <td>{'adrenergic antagonist': {'namespace': 'chebi...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32220710.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
+       "      <td>{'ammonium chloride': {'namespace': 'chebi', '...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'G protein, vesicular stomatitis virus': {'na...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>If the latter percentage would be found to be ...</td>\n",
+       "      <td>{'angiotensin receptor antagonist': {'namespac...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32129518.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
+       "      <td>{'bafilomycin A1': {'namespace': 'chebi', 'nam...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'G protein, vesicular stomatitis virus': {'na...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text  \\\n",
+       "0  While blocking TPC2 activity by tetrandrine, a...   \n",
+       "1  Chemoinformatics searches yielded 15 approved ...   \n",
+       "2  Thyroid stimulating hormone and free triiodoth...   \n",
+       "3  The administration of methylprednisolone appea...   \n",
+       "4  In our opinion, during the COVID-19 pandemic, ...   \n",
+       "5  Consistent with previous reports, 20mM NH4Cl a...   \n",
+       "6  If the latter percentage would be found to be ...   \n",
+       "7  Consistent with previous reports, 20mM NH4Cl a...   \n",
+       "\n",
+       "                                              source  relation  \\\n",
+       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...      True   \n",
+       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...      True   \n",
+       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...      True   \n",
+       "3  {'6-methylprednisolone': {'namespace': 'chebi'...      True   \n",
+       "4  {'adrenergic antagonist': {'namespace': 'chebi...      True   \n",
+       "5  {'ammonium chloride': {'namespace': 'chebi', '...      True   \n",
+       "6  {'angiotensin receptor antagonist': {'namespac...      True   \n",
+       "7  {'bafilomycin A1': {'namespace': 'chebi', 'nam...      True   \n",
+       "\n",
+       "                                              target  \\\n",
+       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
+       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
+       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "3  {'Death': {'namespace': 'mesh', 'name': 'Death...   \n",
+       "4  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "5  {'G protein, vesicular stomatitis virus': {'na...   \n",
+       "6  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "7  {'G protein, vesicular stomatitis virus': {'na...   \n",
+       "\n",
+       "                                                link      pmc_id  \\\n",
+       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
+       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
+       "3  {'annotations': {'doid': {'11394': True}}, 'ci...  32167524.0   \n",
+       "4  {'annotations': {}, 'citation': {'authors': ['...  32220710.0   \n",
+       "5  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "6  {'annotations': {}, 'citation': {'authors': ['...  32129518.0   \n",
+       "7  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "\n",
+       "                                      doi_id  \n",
+       "0                                        NaN  \n",
+       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
+       "2                                        NaN  \n",
+       "3                                        NaN  \n",
+       "4                                        NaN  \n",
+       "5                                        NaN  \n",
+       "6                                        NaN  \n",
+       "7                                        NaN  "
+      ]
+     },
+     "execution_count": 133,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# relation is a boolean flag (True = negativeCorrelation) and True == 1, so this keeps the negative-correlation rows, renumbered from 0\n",
+    "neg_correl_df = example_data[example_data['relation'] == 1].reset_index(drop=True)\n",
+    "neg_correl_df.head(8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry.'"
+      ]
+     },
+     "execution_count": 134,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "neg_correl_df.loc[0, 'text']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.2.2 Examining positive correlation sentences to understand their syntactic structure and then define labelling functions accordingly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 135,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>relation</th>\n",
+       "      <th>target</th>\n",
+       "      <th>link</th>\n",
+       "      <th>pmc_id</th>\n",
+       "      <th>doi_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
+       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Low-grade fever': {'namespace': 'hp', 'name'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32166483.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
+       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'influenza': {'namespace': 'doid', 'name': 'i...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32166483.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>This may be accounted for by two complementary...</td>\n",
+       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32129518.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>ACE2 can also antagonize cardiac fibrosis and ...</td>\n",
+       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Ventricular Remodeling': {'namespace': 'mesh...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221983.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>ACE2 can also antagonize cardiac fibrosis and ...</td>\n",
+       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Myocardial fibrosis': {'namespace': 'hp', 'n...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221983.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>The existence of significantly increased fibri...</td>\n",
+       "      <td>{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Hyperfibrinolysis': {'namespace': 'hp', 'nam...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32216698.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>This opinion is supported by the presence of h...</td>\n",
+       "      <td>{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Hemorrhage': {'namespace': 'mesh', 'name': '...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32216698.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>In the influenza virus model, it was reported ...</td>\n",
+       "      <td>{'chloroquine': {'namespace': 'chebi', 'name':...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'dendritic cell antigen processing and presen...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D007251': True}}, '...</td>\n",
+       "      <td>32171740.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text  \\\n",
+       "0  Adverse reactions of IFN-α mainly include low-...   \n",
+       "1  Adverse reactions of IFN-α mainly include low-...   \n",
+       "2  This may be accounted for by two complementary...   \n",
+       "3  ACE2 can also antagonize cardiac fibrosis and ...   \n",
+       "4  ACE2 can also antagonize cardiac fibrosis and ...   \n",
+       "5  The existence of significantly increased fibri...   \n",
+       "6  This opinion is supported by the presence of h...   \n",
+       "7  In the influenza virus model, it was reported ...   \n",
+       "\n",
+       "                                              source  relation  \\\n",
+       "0  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
+       "1  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
+       "2  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
+       "3  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
+       "4  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
+       "5  {'Fibrin': {'namespace': 'chebi', 'name': 'Fib...     False   \n",
+       "6  {'Fibrin': {'namespace': 'chebi', 'name': 'Fib...     False   \n",
+       "7  {'chloroquine': {'namespace': 'chebi', 'name':...     False   \n",
+       "\n",
+       "                                              target  \\\n",
+       "0  {'Low-grade fever': {'namespace': 'hp', 'name'...   \n",
+       "1  {'influenza': {'namespace': 'doid', 'name': 'i...   \n",
+       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "3  {'Ventricular Remodeling': {'namespace': 'mesh...   \n",
+       "4  {'Myocardial fibrosis': {'namespace': 'hp', 'n...   \n",
+       "5  {'Hyperfibrinolysis': {'namespace': 'hp', 'nam...   \n",
+       "6  {'Hemorrhage': {'namespace': 'mesh', 'name': '...   \n",
+       "7  {'dendritic cell antigen processing and presen...   \n",
+       "\n",
+       "                                                link      pmc_id doi_id  \n",
+       "0  {'annotations': {}, 'citation': {'authors': ['...  32166483.0    NaN  \n",
+       "1  {'annotations': {}, 'citation': {'authors': ['...  32166483.0    NaN  \n",
+       "2  {'annotations': {}, 'citation': {'authors': ['...  32129518.0    NaN  \n",
+       "3  {'annotations': {}, 'citation': {'authors': ['...  32221983.0    NaN  \n",
+       "4  {'annotations': {}, 'citation': {'authors': ['...  32221983.0    NaN  \n",
+       "5  {'annotations': {}, 'citation': {'authors': ['...  32216698.0    NaN  \n",
+       "6  {'annotations': {}, 'citation': {'authors': ['...  32216698.0    NaN  \n",
+       "7  {'annotations': {'mesh': {'D007251': True}}, '...  32171740.0    NaN  "
+      ]
+     },
+     "execution_count": 135,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Keep only the rows whose 'relation' flag equals 0 and renumber the index from 0.\n",
+    "positive_relation_df = (\n",
+    "    example_data.loc[example_data['relation'] == 0]\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "positive_relation_df.head(8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 136,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Adverse reactions of IFN-α mainly include low-grade fever and flu-like symptoms (both in children with intramuscularly injection) [11].'"
+      ]
+     },
+     "execution_count": 136,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "positive_relation_df.loc[0, 'text']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3 Source-Target dictionary"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A simple but clean rule for identifying negative correlation sentences is to check whether negative tokens occur in the words between the source and the target. So, a source-target dictionary is created for some of the examples (in the final pipeline the source-target dictionary will be obtained from the spaCy pipeline)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 137,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('tetrandrine', 'TPC2'),\n",
+       " ('triiodothyronine', 'recovered'),\n",
+       " ('methylprednisolone', 'death'),\n",
+       " ('IFN-α', 'fever'),\n",
+       " ('angiotensin', 'vasodilator'),\n",
+       " ('ACE2', 'Ang'),\n",
+       " ('fibrin', 'COVID-19'),\n",
+       " ('hemorrhage', 'fibrinolysis'),\n",
+       " ('chloroquine', 'dendritic')]"
+      ]
+     },
+     "execution_count": 137,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Hand-picked source mentions; the i-th source is paired with the i-th target below.\n",
+    "example_sources = [\n",
+    "    'tetrandrine',\n",
+    "    'triiodothyronine',\n",
+    "    'methylprednisolone',\n",
+    "    'IFN-α',\n",
+    "    'angiotensin',\n",
+    "    'ACE2',\n",
+    "    'fibrin',\n",
+    "    'hemorrhage',\n",
+    "    'chloroquine',\n",
+    "]\n",
+    "\n",
+    "# Hand-picked target mentions.\n",
+    "# NOTE(review): 'fever' presumably stands for the annotated target 'low-grade fever' - confirm.\n",
+    "example_targets = [\n",
+    "    'TPC2',\n",
+    "    'recovered',\n",
+    "    'death',\n",
+    "    'fever',\n",
+    "    'vasodilator',\n",
+    "    'Ang',\n",
+    "    'COVID-19',\n",
+    "    'fibrinolysis',\n",
+    "    'dendritic',\n",
+    "]\n",
+    "\n",
+    "# Unique (source, target) tuples in first-seen order. Despite the name this is a list, not a dict.\n",
+    "example_source_target_dict = list(\n",
+    "    OrderedDict.fromkeys(zip(example_sources, example_targets))\n",
+    ")\n",
+    "example_source_target_dict"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.4 Labeling functions for RE\n",
+    "\n",
+    "#### 2.4.1 Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Votes a labeling function can cast.\n",
+    "ABSTAIN = -1    # the rule has no opinion on this sentence\n",
+    "NOT_FOUND = 0   # the rule votes against a negative correlation\n",
+    "FOUND = 1       # the rule votes for a negative correlation\n",
+    "\n",
+    "# Shared spaCy preprocessor: reads the `text` field and exposes the parse as `doc`.\n",
+    "spacy = SpacyPreprocessor(\n",
+    "    text_field=\"text\",\n",
+    "    doc_field=\"doc\",\n",
+    "    memoize=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 139,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Source target pair:  ('tetrandrine', 'TPC2')\n"
+     ]
+    }
+   ],
+   "source": [
+    "@preprocessor(pre=[spacy])\n",
+    "def get_source_target(cand: DataPoint) -> DataPoint:\n",
+    "    \"\"\"\n",
+    "    Attach the first source and first target mention found in the sentence.\n",
+    "\n",
+    "    The spaCy tokens in ``cand.doc`` are matched exactly against\n",
+    "    ``example_sources`` and ``example_targets``; the first hit of each is stored\n",
+    "    as the tuple ``cand.source_target``. If either has no match, the tuple is\n",
+    "    ``(nan, nan)``.\n",
+    "    \"\"\"\n",
+    "    sources = [token.text for token in cand.doc if token.text in example_sources]\n",
+    "    targets = [token.text for token in cand.doc if token.text in example_targets]\n",
+    "\n",
+    "    if sources and targets:\n",
+    "        cand.source_target = (sources[0], targets[0])\n",
+    "    else:\n",
+    "        # float('nan') is used instead of np.nan because numpy is not among the\n",
+    "        # notebook's imports, so np.nan would raise NameError on this path.\n",
+    "        cand.source_target = (float('nan'), float('nan'))\n",
+    "    return cand\n",
+    "\n",
+    "########### function example #####################\n",
+    "\n",
+    "candidate = example_data.loc[0]\n",
+    "candidate_with_function_applied = get_source_target(candidate)\n",
+    "\n",
+    "print(\"Source target pair: \", candidate_with_function_applied.source_target)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 140,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sentence:  Thyroid stimulating hormone and free triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in recovered patients (1.4 mIU/mL and 4.3 pmol/L).\n",
+      "Text Between:  triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in\n",
+      "Text Between:  Thyroid stimulating hormone and free\n"
+     ]
+    }
+   ],
+   "source": [
+    "@preprocessor(pre=[spacy])\n",
+    "def get_text_between(cand: DataPoint) -> DataPoint:\n",
+    "    \"\"\"\n",
+    "    Attach the tokens between a source/target pair and the tokens left of the source.\n",
+    "\n",
+    "    Two attributes are set on ``cand``:\n",
+    "\n",
+    "    * ``text_between``: ``cand.doc[source:target]``. The span starts at the source\n",
+    "      token itself and stops just before the target token.\n",
+    "    * ``text_to_source_left``: ``cand.doc[:source]``, i.e. everything before the source.\n",
+    "\n",
+    "    Only pairs in which the source precedes the target are used. When several pairs\n",
+    "    qualify, the last one visited wins: later sources override earlier ones and, for\n",
+    "    a given source, later targets override earlier ones. When no pair qualifies, both\n",
+    "    attributes are the string 'NaN'.\n",
+    "    \"\"\"\n",
+    "    source_idx = [token.i for token in cand.doc if token.text in example_sources]\n",
+    "    target_idx = [token.i for token in cand.doc if token.text in example_targets]\n",
+    "\n",
+    "    # Start from the sentinel so both attributes always exist, including when sources\n",
+    "    # and targets are present but never appear in source-before-target order.\n",
+    "    cand.text_between = 'NaN'\n",
+    "    cand.text_to_source_left = 'NaN'\n",
+    "\n",
+    "    # A single nested scan covers the one-to-one, one-to-many, many-to-one and\n",
+    "    # many-to-many cases; empty index lists simply leave the sentinel in place.\n",
+    "    for source_index in source_idx:\n",
+    "        for target_index in target_idx:\n",
+    "            if source_index < target_index:\n",
+    "                cand.text_between = cand.doc[source_index:target_index]\n",
+    "                cand.text_to_source_left = cand.doc[:source_index]\n",
+    "\n",
+    "    return cand\n",
+    "\n",
+    "############ function example ###############################\n",
+    "candidate = example_data.loc[2]\n",
+    "\n",
+    "candidate_with_function_applied = get_text_between(candidate)\n",
+    "\n",
+    "print(\"Sentence: \", candidate[\"text\"])\n",
+    "print(\"Text Between: \", candidate_with_function_applied.text_between)\n",
+    "print(\"Text To Source Left: \", candidate_with_function_applied.text_to_source_left)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.4.2 Defining the labelling functions\n",
+    "\n",
+    "For the final labelling model to work, at least 3 rules are needed.\n",
+    "\n",
+    "##### A. Positive rules: rules which tell you what a sentence with negative correlation looks like\n",
+    "    \n",
+    "1) Does the sentence contain reduction-related words such as 'decreased', 'reduced' or 'lower'?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 141,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Exact (case-sensitive) token strings treated as evidence of a reduction.\n",
+    "reduction_tokens = {'decreased', 'lower', 'reduced'}\n",
+    "\n",
+    "\n",
+    "@labeling_function(pre=[spacy], resources=dict(reduction_tokens=reduction_tokens))\n",
+    "def contains_reduction_tokens(x, reduction_tokens):\n",
+    "    \"\"\"Vote FOUND when any token of the whole sentence is a reduction token, else ABSTAIN.\"\"\"\n",
+    "    sentence_tokens = {str(token) for token in x.doc}\n",
+    "    return ABSTAIN if reduction_tokens.isdisjoint(sentence_tokens) else FOUND\n",
+    "\n",
+    "\n",
+    "# positive rule - version 2: same check, restricted to the source-to-target span\n",
+    "@labeling_function(pre=[spacy, get_text_between], resources=dict(reduction_tokens=reduction_tokens))\n",
+    "def contains_reduction_tokens_text_between(x, reduction_tokens):\n",
+    "    \"\"\"Vote FOUND when a reduction token occurs in x.text_between, else ABSTAIN.\"\"\"\n",
+    "    between_tokens = {str(token) for token in x.text_between}\n",
+    "    return ABSTAIN if reduction_tokens.isdisjoint(between_tokens) else FOUND\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "2) Does the sentence contain one of the expressions 'negatively correlated', 'negative correlation', 'negatively related', 'inversely related', 'inverse relation', 'negative effect' or 'move in opposite directions'?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 142,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Phrases that explicitly state a negative (opposite-direction) relationship.\n",
+    "negative_correlation_regex_1 = 'negative correlation'\n",
+    "negative_correlation_regex_2 = 'negatively correlated'\n",
+    "negative_correlation_regex_3 = 'negatively related'\n",
+    "negative_correlation_regex_4 = 'inversely related'\n",
+    "negative_correlation_regex_5 = 'inverse relation'\n",
+    "negative_correlation_regex_6 = 'negative effect'\n",
+    "negative_correlation_regex_7 = 'move in opposite directions'\n",
+    "\n",
+    "\n",
+    "@labeling_function()\n",
+    "def contains_negative_corrrelation_regex(x):\n",
+    "    \"\"\"Vote FOUND when x.text contains any negative-correlation phrase (case-insensitive), else ABSTAIN.\"\"\"\n",
+    "    phrases = (\n",
+    "        negative_correlation_regex_1,\n",
+    "        negative_correlation_regex_2,\n",
+    "        negative_correlation_regex_3,\n",
+    "        negative_correlation_regex_4,\n",
+    "        negative_correlation_regex_5,\n",
+    "        negative_correlation_regex_6,\n",
+    "        negative_correlation_regex_7,\n",
+    "    )\n",
+    "    for phrase in phrases:\n",
+    "        if re.search(phrase, x.text, flags=re.I):\n",
+    "            return FOUND\n",
+    "    return ABSTAIN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 143,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@labeling_function(pre=[spacy, get_text_between])\n",
+    "def contains_increase_decrease_pattern(x):\n",
+    "    \"\"\"\n",
+    "    Vote FOUND for an opposite-direction pattern around the source, else ABSTAIN.\n",
+    "\n",
+    "    The pattern holds when the exact token 'increase' appears left of the source and\n",
+    "    'decrease' appears in x.text_between, or the other way round.\n",
+    "    \"\"\"\n",
+    "    left_tokens = [str(token) for token in x.text_to_source_left]\n",
+    "    between_tokens = [str(token) for token in x.text_between]\n",
+    "\n",
+    "    rise_then_fall = 'increase' in left_tokens and 'decrease' in between_tokens\n",
+    "    fall_then_rise = 'decrease' in left_tokens and 'increase' in between_tokens\n",
+    "    return FOUND if (rise_then_fall or fall_then_rise) else ABSTAIN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### B. Negative rules: rules which tell you what a sentence without negative correlation looks like\n",
+    "\n",
+    "1) Does the sentence contain increase-related words such as 'increased' or 'higher'?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 144,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Exact (case-sensitive) token strings treated as evidence of an increase.\n",
+    "increase_tokens = {'increased', 'higher'}\n",
+    "\n",
+    "\n",
+    "@labeling_function(pre=[spacy], resources=dict(increase_tokens=increase_tokens))\n",
+    "def contains_increase_tokens(x, increase_tokens):\n",
+    "    \"\"\"Vote NOT_FOUND when any token of the whole sentence is an increase token, else ABSTAIN.\"\"\"\n",
+    "    sentence_tokens = {str(token) for token in x.doc}\n",
+    "    return ABSTAIN if increase_tokens.isdisjoint(sentence_tokens) else NOT_FOUND\n",
+    "\n",
+    "\n",
+    "# Same check, restricted to the source-to-target span.\n",
+    "@labeling_function(pre=[spacy, get_text_between], resources=dict(increase_tokens=increase_tokens))\n",
+    "def contains_increase_tokens_text_between(x, increase_tokens):\n",
+    "    \"\"\"Vote NOT_FOUND when an increase token occurs in x.text_between, else ABSTAIN.\"\"\"\n",
+    "    between_tokens = {str(token) for token in x.text_between}\n",
+    "    return ABSTAIN if increase_tokens.isdisjoint(between_tokens) else NOT_FOUND"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related' or 'positive effect' or 'move in the same direction'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 145,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#regex\n",
+    "# Phrases that explicitly state a positive (same-direction) relationship.\n",
+    "positive_correlation_regex_1 = 'positive correlation'\n",
+    "positive_correlation_regex_2 = 'positively correlated'\n",
+    "positive_correlation_regex_3 = 'positively related'\n",
+    "positive_correlation_regex_4 = 'positive effect'\n",
+    "positive_correlation_regex_5 = 'move in the same direction'\n",
+    "\n",
+    "\n",
+    "@labeling_function()\n",
+    "def contains_positive_corrrelation_regex(x):\n",
+    "    \"\"\"\n",
+    "    Negative rule: vote NOT_FOUND when x.text states a positive correlation.\n",
+    "\n",
+    "    The match is a case-insensitive search for any of the phrases above. A hit is\n",
+    "    evidence against a negative correlation, so the vote is NOT_FOUND (as in\n",
+    "    contains_increase_tokens) rather than FOUND; without a hit the rule abstains.\n",
+    "    \"\"\"\n",
+    "    phrases = (\n",
+    "        positive_correlation_regex_1,\n",
+    "        positive_correlation_regex_2,\n",
+    "        positive_correlation_regex_3,\n",
+    "        positive_correlation_regex_4,\n",
+    "        positive_correlation_regex_5,\n",
+    "    )\n",
+    "    for phrase in phrases:\n",
+    "        if re.search(phrase, x.text, flags=re.I):\n",
+    "            return NOT_FOUND\n",
+    "    return ABSTAIN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3) Does the syntax follow the structure 'an increase in x is associated with an increase in y' (or, likewise, a decrease in x with a decrease in y)?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 146,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@labeling_function(pre=[spacy, get_text_between])\n",
+    "def contains_increase_increase_pattern(x):\n",
+    "    \"\"\"\n",
+    "    Negative rule: vote NOT_FOUND for a same-direction pattern around the source.\n",
+    "\n",
+    "    The pattern holds when the exact token 'increase' appears both left of the source\n",
+    "    and in x.text_between, or 'decrease' appears in both. Such a sentence is evidence\n",
+    "    against a negative correlation, so the vote is NOT_FOUND rather than FOUND;\n",
+    "    otherwise the rule abstains.\n",
+    "    \"\"\"\n",
+    "    left_tokens = [str(token) for token in x.text_to_source_left]\n",
+    "    between_tokens = [str(token) for token in x.text_between]\n",
+    "\n",
+    "    both_increase = 'increase' in left_tokens and 'increase' in between_tokens\n",
+    "    both_decrease = 'decrease' in left_tokens and 'decrease' in between_tokens\n",
+    "    return NOT_FOUND if (both_increase or both_decrease) else ABSTAIN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.5 Creating all the labels for the different rules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 147,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
+      "  from pandas import Panel\n",
+      "\n",
+      "  0%|                                                                                         | 0/2227 [00:00<?, ?it/s]\u001b[A\n",
+      "  0%|▏                                                                                | 4/2227 [00:00<01:02, 35.32it/s]\u001b[A\n",
+      "  0%|▎                                                                                | 9/2227 [00:00<00:59, 36.97it/s]\u001b[A\n",
+      "  1%|▌                                                                               | 16/2227 [00:00<00:51, 43.02it/s]\u001b[A\n",
+      "  1%|▋                                                                               | 20/2227 [00:00<00:54, 40.82it/s]\u001b[A\n",
+      "  1%|▊                                                                               | 24/2227 [00:00<00:54, 40.45it/s]\u001b[A\n",
+      "  1%|█                                                                               | 29/2227 [00:00<00:51, 42.66it/s]\u001b[A\n",
+      "  2%|█▎                                                                              | 35/2227 [00:00<00:49, 44.55it/s]\u001b[A\n",
+      "  2%|█▍                                                                              | 40/2227 [00:00<00:50, 42.99it/s]\u001b[A\n",
+      "  2%|█▌                                                                              | 45/2227 [00:01<00:54, 39.87it/s]\u001b[A\n",
+      "  3%|██▏                                                                             | 61/2227 [00:01<00:42, 51.21it/s]\u001b[A\n",
+      "  3%|██▍                                                                             | 69/2227 [00:01<00:40, 53.77it/s]\u001b[A\n",
+      "  3%|██▊                                                                             | 77/2227 [00:01<00:36, 58.84it/s]\u001b[A\n",
+      "  4%|███▏                                                                            | 88/2227 [00:01<00:31, 67.95it/s]\u001b[A\n",
+      "  4%|███▍                                                                            | 97/2227 [00:01<00:34, 62.02it/s]\u001b[A\n",
+      "  5%|████                                                                           | 113/2227 [00:01<00:28, 73.29it/s]\u001b[A\n",
+      "  6%|████▋                                                                          | 132/2227 [00:01<00:23, 89.58it/s]\u001b[A\n",
+      "  7%|█████▎                                                                        | 153/2227 [00:01<00:19, 108.20it/s]\u001b[A\n",
+      "  8%|██████▎                                                                       | 181/2227 [00:02<00:15, 132.17it/s]\u001b[A\n",
+      "  9%|███████                                                                       | 200/2227 [00:02<00:14, 139.37it/s]\u001b[A\n",
+      " 10%|███████▋                                                                      | 218/2227 [00:02<00:16, 125.24it/s]\u001b[A\n",
+      " 11%|████████▎                                                                      | 234/2227 [00:02<00:24, 82.06it/s]\u001b[A\n",
+      " 11%|████████▊                                                                      | 247/2227 [00:02<00:26, 73.44it/s]\u001b[A\n",
+      " 12%|█████████▏                                                                     | 258/2227 [00:03<00:26, 73.62it/s]\u001b[A\n",
+      " 12%|█████████▌                                                                     | 268/2227 [00:03<00:33, 58.67it/s]\u001b[A\n",
+      " 12%|█████████▊                                                                     | 276/2227 [00:03<00:44, 44.08it/s]\u001b[A\n",
+      " 13%|██████████                                                                     | 283/2227 [00:03<00:45, 42.30it/s]\u001b[A\n",
+      " 13%|██████████▎                                                                    | 289/2227 [00:03<00:42, 45.53it/s]\u001b[A\n",
+      " 13%|██████████▌                                                                    | 299/2227 [00:04<00:35, 54.16it/s]\u001b[A\n",
+      " 14%|██████████▉                                                                    | 307/2227 [00:04<00:34, 55.86it/s]\u001b[A\n",
+      " 14%|███████████▏                                                                   | 314/2227 [00:04<00:38, 49.07it/s]\u001b[A\n",
+      " 14%|███████████▍                                                                   | 322/2227 [00:04<00:34, 55.50it/s]\u001b[A\n",
+      " 15%|███████████▋                                                                   | 329/2227 [00:04<00:34, 54.91it/s]\u001b[A\n",
+      " 15%|███████████▉                                                                   | 336/2227 [00:04<00:34, 54.86it/s]\u001b[A\n",
+      " 16%|████████████▎                                                                  | 348/2227 [00:04<00:28, 65.38it/s]\u001b[A\n",
+      " 16%|████████████▉                                                                  | 364/2227 [00:04<00:24, 75.68it/s]\u001b[A\n",
+      " 17%|█████████████▌                                                                 | 381/2227 [00:05<00:20, 90.75it/s]\u001b[A\n",
+      " 18%|█████████████▉                                                                 | 393/2227 [00:05<00:20, 88.80it/s]\u001b[A\n",
+      " 19%|██████████████▋                                                               | 419/2227 [00:05<00:16, 110.40it/s]\u001b[A\n",
+      " 20%|███████████████▍                                                              | 441/2227 [00:05<00:13, 129.33it/s]\u001b[A\n",
+      " 21%|████████████████                                                              | 459/2227 [00:05<00:13, 131.51it/s]\u001b[A\n",
+      " 21%|████████████████▋                                                             | 476/2227 [00:05<00:12, 139.84it/s]\u001b[A\n",
+      " 22%|█████████████████▎                                                            | 493/2227 [00:05<00:15, 112.09it/s]\u001b[A\n",
+      " 23%|█████████████████▊                                                            | 507/2227 [00:05<00:16, 106.92it/s]\u001b[A\n",
+      " 24%|██████████████████▎                                                           | 524/2227 [00:06<00:14, 120.04it/s]\u001b[A\n",
+      " 24%|███████████████████                                                           | 545/2227 [00:06<00:12, 136.86it/s]\u001b[A\n",
+      " 25%|███████████████████▊                                                          | 565/2227 [00:06<00:11, 149.93it/s]\u001b[A\n",
+      " 26%|████████████████████▍                                                         | 583/2227 [00:06<00:10, 157.50it/s]\u001b[A\n",
+      " 27%|█████████████████████                                                         | 601/2227 [00:06<00:10, 156.92it/s]\u001b[A\n",
+      " 28%|█████████████████████▋                                                        | 619/2227 [00:06<00:09, 162.40it/s]\u001b[A\n",
+      " 29%|██████████████████████▌                                                       | 644/2227 [00:06<00:08, 181.28it/s]\u001b[A\n",
+      " 30%|███████████████████████▎                                                      | 665/2227 [00:06<00:08, 188.40it/s]\u001b[A\n",
+      " 31%|███████████████████████▉                                                      | 685/2227 [00:06<00:08, 184.86it/s]\u001b[A\n",
+      " 32%|████████████████████████▋                                                     | 705/2227 [00:07<00:08, 183.00it/s]\u001b[A\n",
+      " 33%|█████████████████████████▍                                                    | 725/2227 [00:07<00:08, 187.58it/s]\u001b[A\n",
+      " 34%|██████████████████████████▏                                                   | 748/2227 [00:07<00:07, 192.09it/s]\u001b[A\n",
+      " 34%|██████████████████████████▉                                                   | 768/2227 [00:07<00:08, 180.88it/s]\u001b[A\n",
+      " 35%|███████████████████████████▌                                                  | 787/2227 [00:07<00:08, 176.97it/s]\u001b[A\n",
+      " 36%|████████████████████████████▏                                                 | 805/2227 [00:07<00:10, 136.68it/s]\u001b[A\n",
+      " 37%|████████████████████████████▊                                                 | 821/2227 [00:07<00:12, 117.14it/s]\u001b[A\n",
+      " 37%|█████████████████████████████▌                                                 | 835/2227 [00:08<00:16, 82.38it/s]\u001b[A\n",
+      " 38%|██████████████████████████████                                                 | 846/2227 [00:08<00:21, 63.65it/s]\u001b[A\n",
+      " 38%|██████████████████████████████▎                                                | 855/2227 [00:08<00:25, 53.53it/s]\u001b[A\n",
+      " 39%|██████████████████████████████▌                                                | 863/2227 [00:08<00:25, 53.50it/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 39%|██████████████████████████████▊                                                | 870/2227 [00:08<00:27, 49.89it/s]\u001b[A\n",
+      " 40%|███████████████████████████████▍                                               | 887/2227 [00:09<00:21, 63.09it/s]\u001b[A\n",
+      " 41%|███████████████████████████████▉                                               | 902/2227 [00:09<00:17, 75.94it/s]\u001b[A\n",
+      " 41%|████████████████████████████████▍                                              | 913/2227 [00:09<00:21, 62.34it/s]\u001b[A\n",
+      " 41%|████████████████████████████████▋                                              | 922/2227 [00:09<00:21, 60.38it/s]\u001b[A\n",
+      " 42%|████████████████████████████████▉                                              | 930/2227 [00:09<00:22, 58.34it/s]\u001b[A\n",
+      " 42%|█████████████████████████████████▎                                             | 939/2227 [00:09<00:20, 63.84it/s]\u001b[A\n",
+      " 43%|█████████████████████████████████▋                                             | 951/2227 [00:09<00:17, 73.07it/s]\u001b[A\n",
+      " 43%|██████████████████████████████████                                             | 961/2227 [00:10<00:17, 70.72it/s]\u001b[A\n",
+      " 44%|██████████████████████████████████▎                                            | 969/2227 [00:10<00:22, 56.28it/s]\u001b[A\n",
+      " 44%|██████████████████████████████████▌                                            | 976/2227 [00:10<00:23, 54.06it/s]\u001b[A\n",
+      " 44%|██████████████████████████████████▊                                            | 983/2227 [00:10<00:23, 53.22it/s]\u001b[A\n",
+      " 44%|███████████████████████████████████                                            | 989/2227 [00:10<00:26, 46.14it/s]\u001b[A\n",
+      " 45%|███████████████████████████████████▎                                           | 995/2227 [00:10<00:25, 49.12it/s]\u001b[A\n",
+      " 45%|███████████████████████████████████                                           | 1001/2227 [00:11<00:26, 45.93it/s]\u001b[A\n",
+      " 45%|███████████████████████████████████▍                                          | 1012/2227 [00:11<00:22, 54.76it/s]\u001b[A\n",
+      " 46%|███████████████████████████████████▋                                          | 1019/2227 [00:11<00:23, 52.42it/s]\u001b[A\n",
+      " 46%|████████████████████████████████████▎                                         | 1035/2227 [00:11<00:18, 65.65it/s]\u001b[A\n",
+      " 47%|████████████████████████████████████▌                                         | 1044/2227 [00:11<00:17, 68.59it/s]\u001b[A\n",
+      " 47%|████████████████████████████████████▉                                         | 1053/2227 [00:11<00:20, 58.18it/s]\u001b[A\n",
+      " 48%|█████████████████████████████████████▏                                        | 1061/2227 [00:11<00:21, 54.12it/s]\u001b[A\n",
+      " 48%|█████████████████████████████████████▌                                        | 1073/2227 [00:12<00:18, 63.03it/s]\u001b[A\n",
+      " 49%|█████████████████████████████████████▊                                        | 1081/2227 [00:12<00:21, 52.77it/s]\u001b[A\n",
+      " 49%|██████████████████████████████████████▏                                       | 1090/2227 [00:12<00:19, 58.99it/s]\u001b[A\n",
+      " 49%|██████████████████████████████████████▌                                       | 1102/2227 [00:12<00:16, 69.14it/s]\u001b[A\n",
+      " 50%|██████████████████████████████████████▉                                       | 1112/2227 [00:12<00:15, 72.89it/s]\u001b[A\n",
+      " 50%|███████████████████████████████████████▎                                      | 1121/2227 [00:12<00:14, 75.90it/s]\u001b[A\n",
+      " 51%|███████████████████████████████████████▊                                      | 1138/2227 [00:12<00:12, 90.66it/s]\u001b[A\n",
+      " 52%|████████████████████████████████████████▎                                     | 1151/2227 [00:12<00:10, 99.05it/s]\u001b[A\n",
+      " 52%|████████████████████████████████████████▋                                     | 1163/2227 [00:12<00:10, 98.04it/s]\u001b[A\n",
+      " 53%|████████████████████████████████████████▊                                    | 1180/2227 [00:13<00:09, 111.73it/s]\u001b[A\n",
+      " 54%|█████████████████████████████████████████▏                                   | 1193/2227 [00:13<00:09, 112.70it/s]\u001b[A\n",
+      " 54%|█████████████████████████████████████████▋                                   | 1207/2227 [00:13<00:08, 115.89it/s]\u001b[A\n",
+      " 55%|██████████████████████████████████████████▎                                  | 1224/2227 [00:13<00:07, 128.12it/s]\u001b[A\n",
+      " 56%|██████████████████████████████████████████▊                                  | 1239/2227 [00:13<00:07, 132.43it/s]\u001b[A\n",
+      " 56%|███████████████████████████████████████████▎                                 | 1253/2227 [00:13<00:09, 100.50it/s]\u001b[A\n",
+      " 57%|████████████████████████████████████████████▎                                 | 1265/2227 [00:13<00:09, 97.93it/s]\u001b[A\n",
+      " 57%|████████████████████████████████████████████▏                                | 1277/2227 [00:13<00:09, 102.33it/s]\u001b[A\n",
+      " 58%|█████████████████████████████████████████████▏                                | 1289/2227 [00:14<00:12, 77.26it/s]\u001b[A\n",
+      " 58%|█████████████████████████████████████████████▍                                | 1299/2227 [00:14<00:13, 69.14it/s]\u001b[A\n",
+      " 59%|█████████████████████████████████████████████▊                                | 1309/2227 [00:14<00:12, 75.56it/s]\u001b[A\n",
+      " 60%|██████████████████████████████████████████████▌                               | 1330/2227 [00:14<00:09, 93.25it/s]\u001b[A\n",
+      " 60%|███████████████████████████████████████████████                               | 1343/2227 [00:14<00:08, 98.27it/s]\u001b[A\n",
+      " 61%|███████████████████████████████████████████████▏                             | 1363/2227 [00:14<00:07, 114.93it/s]\u001b[A\n",
+      " 62%|███████████████████████████████████████████████▋                             | 1379/2227 [00:14<00:06, 125.14it/s]\u001b[A\n",
+      " 63%|████████████████████████████████████████████████▎                            | 1396/2227 [00:15<00:06, 133.88it/s]\u001b[A\n",
+      " 63%|█████████████████████████████████████████████████▍                            | 1411/2227 [00:15<00:12, 64.64it/s]\u001b[A\n",
+      " 64%|█████████████████████████████████████████████████▊                            | 1423/2227 [00:15<00:14, 54.47it/s]\u001b[A\n",
+      " 65%|██████████████████████████████████████████████████▌                           | 1442/2227 [00:15<00:11, 69.27it/s]\u001b[A\n",
+      " 66%|███████████████████████████████████████████████████                           | 1459/2227 [00:16<00:09, 82.11it/s]\u001b[A\n",
+      " 66%|███████████████████████████████████████████████████▌                          | 1472/2227 [00:16<00:08, 90.32it/s]\u001b[A\n",
+      " 67%|████████████████████████████████████████████████████                          | 1485/2227 [00:16<00:07, 96.08it/s]\u001b[A\n",
+      " 67%|███████████████████████████████████████████████████▊                         | 1498/2227 [00:16<00:07, 100.63it/s]\u001b[A\n",
+      " 68%|████████████████████████████████████████████████████▎                        | 1513/2227 [00:16<00:06, 111.34it/s]\u001b[A\n",
+      " 69%|█████████████████████████████████████████████████████                        | 1533/2227 [00:16<00:05, 127.75it/s]\u001b[A\n",
+      " 70%|██████████████████████████████████████████████████████▏                       | 1548/2227 [00:16<00:06, 99.87it/s]\u001b[A\n",
+      " 70%|██████████████████████████████████████████████████████▋                       | 1561/2227 [00:17<00:09, 70.40it/s]\u001b[A\n",
+      " 71%|███████████████████████████████████████████████████████                       | 1571/2227 [00:17<00:09, 69.84it/s]\u001b[A\n",
+      " 71%|███████████████████████████████████████████████████████▍                      | 1584/2227 [00:17<00:08, 78.10it/s]\u001b[A\n",
+      " 72%|███████████████████████████████████████████████████████▊                      | 1595/2227 [00:17<00:07, 83.84it/s]\u001b[A\n",
+      " 72%|████████████████████████████████████████████████████████▏                     | 1605/2227 [00:17<00:08, 75.55it/s]\u001b[A\n",
+      " 72%|████████████████████████████████████████████████████████▌                     | 1614/2227 [00:17<00:07, 77.72it/s]\u001b[A\n",
+      " 73%|████████████████████████████████████████████████████████▉                     | 1624/2227 [00:17<00:07, 82.39it/s]\u001b[A\n",
+      " 74%|█████████████████████████████████████████████████████████▋                    | 1647/2227 [00:18<00:05, 99.88it/s]\u001b[A\n",
+      " 75%|█████████████████████████████████████████████████████████▍                   | 1660/2227 [00:18<00:05, 103.38it/s]\u001b[A\n",
+      " 75%|██████████████████████████████████████████████████████████▌                   | 1672/2227 [00:18<00:05, 99.51it/s]\u001b[A\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 76%|██████████████████████████████████████████████████████████▏                  | 1684/2227 [00:18<00:05, 102.91it/s]\u001b[A\n",
+      " 76%|██████████████████████████████████████████████████████████▋                  | 1697/2227 [00:18<00:04, 109.63it/s]\u001b[A\n",
+      " 77%|███████████████████████████████████████████████████████████▏                 | 1712/2227 [00:18<00:04, 119.03it/s]\u001b[A\n",
+      " 78%|███████████████████████████████████████████████████████████▉                 | 1733/2227 [00:18<00:03, 136.08it/s]\u001b[A\n",
+      " 79%|████████████████████████████████████████████████████████████▋                | 1757/2227 [00:18<00:03, 155.76it/s]\u001b[A\n",
+      " 80%|█████████████████████████████████████████████████████████████▋               | 1783/2227 [00:18<00:02, 176.00it/s]\u001b[A\n",
+      " 81%|██████████████████████████████████████████████████████████████▎              | 1803/2227 [00:19<00:02, 144.49it/s]\u001b[A\n",
+      " 82%|██████████████████████████████████████████████████████████████▉              | 1820/2227 [00:19<00:03, 134.51it/s]\u001b[A\n",
+      " 83%|███████████████████████████████████████████████████████████████▌             | 1840/2227 [00:19<00:02, 148.72it/s]\u001b[A\n",
+      " 83%|████████████████████████████████████████████████████████████████▎            | 1859/2227 [00:19<00:02, 158.07it/s]\u001b[A\n",
+      " 85%|█████████████████████████████████████████████████████████████████            | 1883/2227 [00:19<00:01, 174.77it/s]\u001b[A\n",
+      " 85%|█████████████████████████████████████████████████████████████████▊           | 1903/2227 [00:19<00:02, 133.57it/s]\u001b[A\n",
+      " 86%|██████████████████████████████████████████████████████████████████▎          | 1919/2227 [00:19<00:02, 128.47it/s]\u001b[A\n",
+      " 87%|██████████████████████████████████████████████████████████████████▊          | 1934/2227 [00:20<00:02, 127.47it/s]\u001b[A\n",
+      " 88%|███████████████████████████████████████████████████████████████████▍         | 1949/2227 [00:20<00:02, 126.35it/s]\u001b[A\n",
+      " 88%|████████████████████████████████████████████████████████████████████         | 1969/2227 [00:20<00:01, 141.48it/s]\u001b[A\n",
+      " 89%|████████████████████████████████████████████████████████████████████▊        | 1992/2227 [00:20<00:01, 159.94it/s]\u001b[A\n",
+      " 90%|█████████████████████████████████████████████████████████████████████▌       | 2013/2227 [00:20<00:01, 171.16it/s]\u001b[A\n",
+      " 92%|██████████████████████████████████████████████████████████████████████▍      | 2039/2227 [00:20<00:00, 190.33it/s]\u001b[A\n",
+      " 93%|███████████████████████████████████████████████████████████████████████▏     | 2060/2227 [00:20<00:00, 194.50it/s]\u001b[A\n",
+      " 93%|███████████████████████████████████████████████████████████████████████▉     | 2081/2227 [00:20<00:00, 187.83it/s]\u001b[A\n",
+      " 94%|████████████████████████████████████████████████████████████████████████▋    | 2101/2227 [00:20<00:00, 177.27it/s]\u001b[A\n",
+      " 95%|█████████████████████████████████████████████████████████████████████████▎   | 2120/2227 [00:20<00:00, 176.31it/s]\u001b[A\n",
+      " 96%|██████████████████████████████████████████████████████████████████████████   | 2142/2227 [00:21<00:00, 185.80it/s]\u001b[A\n",
+      " 97%|██████████████████████████████████████████████████████████████████████████▊  | 2162/2227 [00:21<00:00, 177.11it/s]\u001b[A\n",
+      " 98%|███████████████████████████████████████████████████████████████████████████▌ | 2186/2227 [00:21<00:00, 191.89it/s]\u001b[A\n",
+      "100%|█████████████████████████████████████████████████████████████████████████████| 2227/2227 [00:21<00:00, 103.56it/s]\u001b[A\n"
+     ]
+    }
+   ],
+   "source": [
+    "label_functions_list = [contains_reduction_tokens,\n",
+    "                        contains_reduction_tokens_text_between,\n",
+    "                        contains_negative_corrrelation_regex,\n",
+    "                        contains_increase_decrease_pattern,\n",
+    "                        contains_increase_tokens,\n",
+    "                        contains_increase_tokens_text_between,\n",
+    "                        contains_positive_corrrelation_regex,\n",
+    "                        contains_increase_increase_pattern\n",
+    "                       ]\n",
+    "\n",
+    "applier = PandasLFApplier(label_functions_list)\n",
+    "\n",
+    "label_matrix_train = applier.apply(df_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.6 Examining the quality of the labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 148,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>j</th>\n",
+       "      <th>Polarity</th>\n",
+       "      <th>Coverage</th>\n",
+       "      <th>Overlaps</th>\n",
+       "      <th>Conflicts</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>contains_reduction_tokens</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.057027</td>\n",
+       "      <td>0.019308</td>\n",
+       "      <td>0.016165</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_reduction_tokens_text_between</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.004041</td>\n",
+       "      <td>0.004041</td>\n",
+       "      <td>0.000898</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_negative_corrrelation_regex</td>\n",
+       "      <td>2</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.008532</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_decrease_pattern</td>\n",
+       "      <td>3</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_tokens</td>\n",
+       "      <td>4</td>\n",
+       "      <td>[0]</td>\n",
+       "      <td>0.130220</td>\n",
+       "      <td>0.016165</td>\n",
+       "      <td>0.016165</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_tokens_text_between</td>\n",
+       "      <td>5</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_positive_corrrelation_regex</td>\n",
+       "      <td>6</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.002245</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_increase_pattern</td>\n",
+       "      <td>7</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        j Polarity  Coverage  Overlaps  \\\n",
+       "contains_reduction_tokens               0      [1]  0.057027  0.019308   \n",
+       "contains_reduction_tokens_text_between  1      [1]  0.004041  0.004041   \n",
+       "contains_negative_corrrelation_regex    2      [1]  0.008532  0.000000   \n",
+       "contains_increase_decrease_pattern      3       []  0.000000  0.000000   \n",
+       "contains_increase_tokens                4      [0]  0.130220  0.016165   \n",
+       "contains_increase_tokens_text_between   5       []  0.000000  0.000000   \n",
+       "contains_positive_corrrelation_regex    6      [1]  0.002245  0.000000   \n",
+       "contains_increase_increase_pattern      7       []  0.000000  0.000000   \n",
+       "\n",
+       "                                        Conflicts  \n",
+       "contains_reduction_tokens                0.016165  \n",
+       "contains_reduction_tokens_text_between   0.000898  \n",
+       "contains_negative_corrrelation_regex     0.000000  \n",
+       "contains_increase_decrease_pattern       0.000000  \n",
+       "contains_increase_tokens                 0.016165  \n",
+       "contains_increase_tokens_text_between    0.000000  \n",
+       "contains_positive_corrrelation_regex     0.000000  \n",
+       "contains_increase_increase_pattern       0.000000  "
+      ]
+     },
+     "execution_count": 148,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# without benchmark labels, the summary reports each LF's polarity, coverage, overlaps and conflicts\n",
+    "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 149,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>j</th>\n",
+       "      <th>Polarity</th>\n",
+       "      <th>Coverage</th>\n",
+       "      <th>Overlaps</th>\n",
+       "      <th>Conflicts</th>\n",
+       "      <th>Correct</th>\n",
+       "      <th>Incorrect</th>\n",
+       "      <th>Emp. Acc.</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>contains_reduction_tokens</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.057027</td>\n",
+       "      <td>0.019308</td>\n",
+       "      <td>0.016165</td>\n",
+       "      <td>69</td>\n",
+       "      <td>58</td>\n",
+       "      <td>0.543307</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_reduction_tokens_text_between</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.004041</td>\n",
+       "      <td>0.004041</td>\n",
+       "      <td>0.000898</td>\n",
+       "      <td>8</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.888889</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_negative_corrrelation_regex</td>\n",
+       "      <td>2</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.008532</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>18</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.947368</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_decrease_pattern</td>\n",
+       "      <td>3</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_tokens</td>\n",
+       "      <td>4</td>\n",
+       "      <td>[0]</td>\n",
+       "      <td>0.130220</td>\n",
+       "      <td>0.016165</td>\n",
+       "      <td>0.016165</td>\n",
+       "      <td>263</td>\n",
+       "      <td>27</td>\n",
+       "      <td>0.906897</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_tokens_text_between</td>\n",
+       "      <td>5</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_positive_corrrelation_regex</td>\n",
+       "      <td>6</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.002245</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_increase_pattern</td>\n",
+       "      <td>7</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        j Polarity  Coverage  Overlaps  \\\n",
+       "contains_reduction_tokens               0      [1]  0.057027  0.019308   \n",
+       "contains_reduction_tokens_text_between  1      [1]  0.004041  0.004041   \n",
+       "contains_negative_corrrelation_regex    2      [1]  0.008532  0.000000   \n",
+       "contains_increase_decrease_pattern      3       []  0.000000  0.000000   \n",
+       "contains_increase_tokens                4      [0]  0.130220  0.016165   \n",
+       "contains_increase_tokens_text_between   5       []  0.000000  0.000000   \n",
+       "contains_positive_corrrelation_regex    6      [1]  0.002245  0.000000   \n",
+       "contains_increase_increase_pattern      7       []  0.000000  0.000000   \n",
+       "\n",
+       "                                        Conflicts  Correct  Incorrect  \\\n",
+       "contains_reduction_tokens                0.016165       69         58   \n",
+       "contains_reduction_tokens_text_between   0.000898        8          1   \n",
+       "contains_negative_corrrelation_regex     0.000000       18          1   \n",
+       "contains_increase_decrease_pattern       0.000000        0          0   \n",
+       "contains_increase_tokens                 0.016165      263         27   \n",
+       "contains_increase_tokens_text_between    0.000000        0          0   \n",
+       "contains_positive_corrrelation_regex     0.000000        0          5   \n",
+       "contains_increase_increase_pattern       0.000000        0          0   \n",
+       "\n",
+       "                                        Emp. Acc.  \n",
+       "contains_reduction_tokens                0.543307  \n",
+       "contains_reduction_tokens_text_between   0.888889  \n",
+       "contains_negative_corrrelation_regex     0.947368  \n",
+       "contains_increase_decrease_pattern       0.000000  \n",
+       "contains_increase_tokens                 0.906897  \n",
+       "contains_increase_tokens_text_between    0.000000  \n",
+       "contains_positive_corrrelation_regex     0.000000  \n",
+       "contains_increase_increase_pattern       0.000000  "
+      ]
+     },
+     "execution_count": 149,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# with benchmark labels (y_train) to compare against, the summary also reports correct/incorrect counts and empirical accuracy per LF\n",
+    "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary(y_train.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 150,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>The administration of methylprednisolone appea...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>12</td>\n",
+       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1644</td>\n",
+       "      <td>Actual bicarbonate and total carbon dioxide co...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1655</td>\n",
+       "      <td>Albumin concentrations were significantly lowe...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1657</td>\n",
+       "      <td>Moreover, the frequencies of regulatory T cell...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1658</td>\n",
+       "      <td>The reduced expressions of interferon-γ (IFN-γ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1668</td>\n",
+       "      <td>Spleen atrophy was observed in all reported ca...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>127 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                   text\n",
+       "0     While blocking TPC2 activity by tetrandrine, a...\n",
+       "2     Thyroid stimulating hormone and free triiodoth...\n",
+       "3     The administration of methylprednisolone appea...\n",
+       "7     Consistent with previous reports, 20mM NH4Cl a...\n",
+       "12    Consistent with previous reports, 20mM NH4Cl a...\n",
+       "...                                                 ...\n",
+       "1644  Actual bicarbonate and total carbon dioxide co...\n",
+       "1655  Albumin concentrations were significantly lowe...\n",
+       "1657  Moreover, the frequencies of regulatory T cell...\n",
+       "1658  The reduced expressions of interferon-γ (IFN-γ...\n",
+       "1668  Spleen atrophy was observed in all reported ca...\n",
+       "\n",
+       "[127 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 150,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
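+    "# examine which sentences were labelled FOUND by the first label function (column 0 of the label matrix, i.e. contains_reduction_tokens); change the column index to inspect another label function\n",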
+    "df_train.iloc[label_matrix_train[:, 0] == FOUND]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.7 Predict the final label\n",
+    "\n",
+    "Different models can be used to aggregate the outputs of the different label functions and predict the final label."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 151,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
+      "  from pandas import Panel\n",
+      "\n",
+      "  0%|                                                                                          | 0/557 [00:00<?, ?it/s]\u001b[A\n",
+      "  5%|███▉                                                                            | 27/557 [00:00<00:01, 266.68it/s]\u001b[A\n",
+      "  7%|█████▌                                                                          | 39/557 [00:00<00:02, 192.85it/s]\u001b[A\n",
+      "  9%|███████▍                                                                        | 52/557 [00:00<00:03, 166.70it/s]\u001b[A\n",
+      " 12%|█████████▍                                                                      | 66/557 [00:00<00:03, 156.09it/s]\u001b[A\n",
+      " 14%|███████████▎                                                                    | 79/557 [00:00<00:03, 138.04it/s]\u001b[A\n",
+      " 16%|█████████████                                                                   | 91/557 [00:00<00:03, 130.99it/s]\u001b[A\n",
+      " 19%|███████████████▏                                                               | 107/557 [00:00<00:03, 137.18it/s]\u001b[A\n",
+      " 22%|█████████████████▌                                                             | 124/557 [00:00<00:03, 144.01it/s]\u001b[A\n",
+      " 26%|████████████████████▎                                                          | 143/557 [00:00<00:02, 154.01it/s]\u001b[A\n",
+      " 29%|██████████████████████▌                                                        | 159/557 [00:01<00:02, 139.24it/s]\u001b[A\n",
+      " 31%|████████████████████████▌                                                      | 173/557 [00:01<00:02, 135.02it/s]\u001b[A\n",
+      " 34%|██████████████████████████▌                                                    | 187/557 [00:01<00:02, 130.39it/s]\u001b[A\n",
+      " 36%|████████████████████████████▌                                                  | 201/557 [00:01<00:02, 129.75it/s]\u001b[A\n",
+      " 39%|███████████████████████████████▏                                               | 220/557 [00:01<00:02, 142.39it/s]\u001b[A\n",
+      " 43%|██████████████████████████████████▏                                            | 241/557 [00:01<00:02, 152.56it/s]\u001b[A\n",
+      " 46%|████████████████████████████████████▍                                          | 257/557 [00:01<00:02, 145.73it/s]\u001b[A\n",
+      " 49%|██████████████████████████████████████▌                                        | 272/557 [00:01<00:02, 133.97it/s]\u001b[A\n",
+      " 51%|████████████████████████████████████████▌                                      | 286/557 [00:02<00:02, 126.53it/s]\u001b[A\n",
+      " 54%|██████████████████████████████████████████▋                                    | 301/557 [00:02<00:01, 132.39it/s]\u001b[A\n",
+      " 57%|█████████████████████████████████████████████▏                                 | 319/557 [00:02<00:01, 142.70it/s]\u001b[A\n",
+      " 61%|████████████████████████████████████████████████                               | 339/557 [00:02<00:01, 155.15it/s]\u001b[A\n",
+      " 64%|██████████████████████████████████████████████████▍                            | 356/557 [00:02<00:01, 153.73it/s]\u001b[A\n",
+      " 67%|████████████████████████████████████████████████████▊                          | 372/557 [00:02<00:01, 155.09it/s]\u001b[A\n",
+      " 70%|███████████████████████████████████████████████████████▏                       | 389/557 [00:02<00:01, 158.37it/s]\u001b[A\n",
+      " 73%|█████████████████████████████████████████████████████████▌                     | 406/557 [00:02<00:00, 159.67it/s]\u001b[A\n",
+      " 76%|███████████████████████████████████████████████████████████▉                   | 423/557 [00:02<00:00, 158.73it/s]\u001b[A\n",
+      " 79%|██████████████████████████████████████████████████████████████▍                | 440/557 [00:02<00:00, 159.66it/s]\u001b[A\n",
+      " 82%|████████████████████████████████████████████████████████████████▊              | 457/557 [00:03<00:00, 155.04it/s]\u001b[A\n",
+      " 87%|████████████████████████████████████████████████████████████████████▎          | 482/557 [00:03<00:00, 174.46it/s]\u001b[A\n",
+      " 90%|███████████████████████████████████████████████████████████████████████        | 501/557 [00:03<00:00, 169.50it/s]\u001b[A\n",
+      " 93%|█████████████████████████████████████████████████████████████████████████▌     | 519/557 [00:03<00:00, 151.16it/s]\u001b[A\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████| 557/557 [00:03<00:00, 152.95it/s]\u001b[A\n"
+     ]
+    }
+   ],
+   "source": [
+    "#testing data\n",
+    "label_matrix_test = applier.apply(df_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 152,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Model 1 : majority model (mm)\n",
+    "majority_model = MajorityLabelVoter()\n",
+    "\n",
+    "#training data\n",
+    "mm_preds_class_train = majority_model.predict(L=label_matrix_train)\n",
+    "mm_preds_proba_train = majority_model.predict_proba(L=label_matrix_train)\n",
+    "\n",
+    "#testing data\n",
+    "mm_preds_class_test = majority_model.predict(L=label_matrix_test)\n",
+    "mm_preds_proba_test = majority_model.predict_proba(L=label_matrix_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 153,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 1, -1,  1, ..., -1, -1, -1])"
+      ]
+     },
+     "execution_count": 153,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mm_preds_class_train # only the 1s and 0s are labels. The -1s are abstains, i.e. unlabeled data points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 154,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0. , 1. ],\n",
+       "       [0.5, 0.5],\n",
+       "       [0. , 1. ],\n",
+       "       ...,\n",
+       "       [0.5, 0.5],\n",
+       "       [0.5, 0.5],\n",
+       "       [0.5, 0.5]])"
+      ]
+     },
+     "execution_count": 154,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mm_preds_proba_train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 155,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model 2:label model (lm)\n",
+    "\n",
+    "#call the model\n",
+    "label_model = LabelModel(cardinality=2, verbose=True)\n",
+    "\n",
+    "#fit the model\n",
+    "num_epochs = 1000\n",
+    "log_frequency = 100\n",
+    "random_seed = 1\n",
+    "label_model.fit(L_train=label_matrix_train, n_epochs=num_epochs, log_freq=log_frequency, seed=random_seed)\n",
+    "\n",
+    "#generate labels for training data\n",
+    "lm_preds_proba_train = label_model.predict_proba(label_matrix_train)\n",
+    "lm_preds_class_train = probs_to_preds(lm_preds_proba_train)\n",
+    "\n",
+    "#generate labels for testing data\n",
+    "lm_preds_proba_test = label_model.predict_proba(label_matrix_test)\n",
+    "lm_preds_class_test = probs_to_preds(lm_preds_proba_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 156,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model 3 : Random Voter (rv)\n",
+    "\n",
+    "random_voter = RandomVoter()\n",
+    "\n",
+    "#training data\n",
+    "rv_preds_class_train = random_voter.predict(L=label_matrix_train)\n",
+    "rv_preds_proba_train = random_voter.predict_proba(L=label_matrix_train)\n",
+    "\n",
+    "#testing data\n",
+    "rv_preds_class_test = random_voter.predict(L=label_matrix_test)\n",
+    "rv_preds_proba_test = random_voter.predict_proba(L=label_matrix_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.8 Comparing different models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Majority Model \n",
+      " Accuracy: \n",
+      " train-> 0.5635383924562192 \n",
+      " test-> 0.4703770197486535 \n",
+      " AUC: \n",
+      " train-> 0.6010212548732079 \n",
+      " test-> 0.5227059436913452 \n",
+      "\n",
+      "Label Model \n",
+      " Accuracy: \n",
+      " train-> 0.5527615626403233 \n",
+      " test-> 0.49012567324955114 \n",
+      " AUC: \n",
+      " train-> 0.524345344386498 \n",
+      " test-> 0.4437434827945777 \n",
+      "\n",
+      "Random Voter Model \n",
+      " Accuracy: \n",
+      " train-> 0.5024696901661428 \n",
+      " test-> 0.5008976660682226 \n",
+      " AUC: \n",
+      " train-> 0.5126309212678816 \n",
+      " test-> 0.5023114355231144 \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_models = {'Majority Model':majority_model,\n",
+    "              'Label Model':label_model,\n",
+    "              'Random Voter Model':random_voter}\n",
+    "\n",
+    "for model_name,model in all_models.items():\n",
+    "    \n",
+    "    #accuracy\n",
+    "    train_acc = model.score(L=label_matrix_train, Y=y_train, tie_break_policy=\"random\")[\"accuracy\"]\n",
+    "    test_acc = model.score(L=label_matrix_test, Y=y_test, tie_break_policy=\"random\")[\"accuracy\"]\n",
+    "    \n",
+    "    #auc\n",
+    "    train_auc = metric_score(y_train, probs=model.predict_proba(L=label_matrix_train), metric='roc_auc')\n",
+    "    test_auc = metric_score(y_test, probs=model.predict_proba(L=label_matrix_test), metric='roc_auc')\n",
+    "    \n",
+    "    print(f'{model_name}','\\n',\n",
+    "          'Accuracy:','\\n','train->',train_acc,'\\n','test->',test_acc,'\\n',\n",
+    "          'AUC:','\\n','train->',train_auc,'\\n','test->',test_auc,'\\n')\n",
+    "    \n",
+    "    \n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
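+    "Per the output above, the majority model has the highest test AUC (0.52) while the label model has the lowest (0.44); all three models are close to chance on the test set. The next section nevertheless uses the label model's probabilistic labels."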
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3. Filter out unlabeled points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 169,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total points labelled in training data: 405\n",
+      "Total points labelled in testing data: 91\n"
+     ]
+    }
+   ],
+   "source": [
+    "#training labels\n",
+    "df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(\n",
+    "    X=df_train['text'], \n",
+    "    y=lm_preds_proba_train, \n",
+    "    L=label_matrix_train\n",
+    ")\n",
+    "\n",
+    "#testing labels\n",
+    "df_test_filtered, probs_test_filtered = filter_unlabeled_dataframe(\n",
+    "    X=df_test['text'], \n",
+    "    y=lm_preds_proba_test, \n",
+    "    L=label_matrix_test\n",
+    ")\n",
+    "\n",
+    "print('Total points labelled in training data:',len(df_train_filtered))\n",
+    "print('Total points labelled in testing data:',len(df_test_filtered))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# References\n",
+    "\n",
+    "https://www.snorkel.org/use-cases/spouse-demo\n",
+    "    \n",
+    "https://github.com/snorkel-team/snorkel-tutorials/blob/master/spouse/spouse_demo.ipynb\n",
+    "    \n",
+    "https://www.snorkel.org/use-cases/01-spam-tutorial\n",
+    "    \n",
+    "https://readthedocs.org/projects/snorkel/downloads/pdf/master/"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 68a1ce1d76c6cd16d88c64166d482742f7d81377 Mon Sep 17 00:00:00 2001
From: Kriti Mahajan <kriti.mahajan.13@gmail.com>
Date: Wed, 1 Jul 2020 20:04:17 +0530
Subject: [PATCH 2/3] Delete Snorkel RE example.ipynb

---
 .../notebooks/Snorkel RE example.ipynb        | 2187 -----------------
 1 file changed, 2187 deletions(-)
 delete mode 100644 immunology_kg/notebooks/Snorkel RE example.ipynb

diff --git a/immunology_kg/notebooks/Snorkel RE example.ipynb b/immunology_kg/notebooks/Snorkel RE example.ipynb
deleted file mode 100644
index d44c4a9..0000000
--- a/immunology_kg/notebooks/Snorkel RE example.ipynb	
+++ /dev/null
@@ -1,2187 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import snorkel\n",
-    "\n",
-    "from snorkel.preprocess import preprocessor\n",
-    "from snorkel.preprocess.nlp import SpacyPreprocessor\n",
-    "from snorkel.types import DataPoint\n",
-    "\n",
-    "from snorkel.labeling.lf.nlp import nlp_labeling_function\n",
-    "from snorkel.labeling import PandasLFApplier,filter_unlabeled_dataframe,LFAnalysis ,labeling_function\n",
-    "from snorkel.labeling.model import MajorityClassVoter,MajorityLabelVoter,RandomVoter ,LabelModel\n",
-    "\n",
-    "from snorkel.analysis import metric_score , get_label_buckets\n",
-    "\n",
-    "from snorkel.utils import probs_to_preds\n",
-    "\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "\n",
-    "import pandas as pd\n",
-    "import re\n",
-    "import os\n",
-    "from collections import OrderedDict"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 1. Load the data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Unnamed: 0</th>\n",
-       "      <th>text</th>\n",
-       "      <th>source</th>\n",
-       "      <th>relation</th>\n",
-       "      <th>target</th>\n",
-       "      <th>link</th>\n",
-       "      <th>pmc_id</th>\n",
-       "      <th>doi_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
-       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
-       "      <td>negativeCorrelation</td>\n",
-       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32221306.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
-       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
-       "      <td>negativeCorrelation</td>\n",
-       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>2</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
-       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
-       "      <td>negativeCorrelation</td>\n",
-       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
-       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
-       "      <td>32217556.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>3</td>\n",
-       "      <td>3</td>\n",
-       "      <td>Based on these results, we performed virtual d...</td>\n",
-       "      <td>{\"4'-epidoxorubicin\": {'namespace': 'chebi', '...</td>\n",
-       "      <td>decreases</td>\n",
-       "      <td>{'3.4.22.69': {'namespace': 'eccode', 'name': ...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32173287.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>4</td>\n",
-       "      <td>4</td>\n",
-       "      <td>Doctors can also use a clinically approved bil...</td>\n",
-       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
-       "      <td>decreases</td>\n",
-       "      <td>{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...</td>\n",
-       "      <td>{'annotations': {'mesh': {'D008168': True}}, '...</td>\n",
-       "      <td>32205856.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>5</td>\n",
-       "      <td>5</td>\n",
-       "      <td>Since Vitamin B3 is highly lung protective, it...</td>\n",
-       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
-       "      <td>decreases</td>\n",
-       "      <td>{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32205856.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>Doctors can also use a clinically approved bil...</td>\n",
-       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
-       "      <td>decreases</td>\n",
-       "      <td>{'inflammatory response': {'namespace': 'go', ...</td>\n",
-       "      <td>{'annotations': {'mesh': {'D008168': True}}, '...</td>\n",
-       "      <td>32205856.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   Unnamed: 0                                               text  \\\n",
-       "0           0  While blocking TPC2 activity by tetrandrine, a...   \n",
-       "1           1  Chemoinformatics searches yielded 15 approved ...   \n",
-       "2           2  Thyroid stimulating hormone and free triiodoth...   \n",
-       "3           3  Based on these results, we performed virtual d...   \n",
-       "4           4  Doctors can also use a clinically approved bil...   \n",
-       "5           5  Since Vitamin B3 is highly lung protective, it...   \n",
-       "6           6  Doctors can also use a clinically approved bil...   \n",
-       "\n",
-       "                                              source             relation  \\\n",
-       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...  negativeCorrelation   \n",
-       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...  negativeCorrelation   \n",
-       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...  negativeCorrelation   \n",
-       "3  {\"4'-epidoxorubicin\": {'namespace': 'chebi', '...            decreases   \n",
-       "4  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
-       "5  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
-       "6  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
-       "\n",
-       "                                              target  \\\n",
-       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
-       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
-       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
-       "3  {'3.4.22.69': {'namespace': 'eccode', 'name': ...   \n",
-       "4  {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...   \n",
-       "5  {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...   \n",
-       "6  {'inflammatory response': {'namespace': 'go', ...   \n",
-       "\n",
-       "                                                link      pmc_id  \\\n",
-       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
-       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
-       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
-       "3  {'annotations': {}, 'citation': {'authors': ['...  32173287.0   \n",
-       "4  {'annotations': {'mesh': {'D008168': True}}, '...  32205856.0   \n",
-       "5  {'annotations': {}, 'citation': {'authors': ['...  32205856.0   \n",
-       "6  {'annotations': {'mesh': {'D008168': True}}, '...  32205856.0   \n",
-       "\n",
-       "                                      doi_id  \n",
-       "0                                        NaN  \n",
-       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
-       "2                                        NaN  \n",
-       "3                                        NaN  \n",
-       "4                                        NaN  \n",
-       "5                                        NaN  \n",
-       "6                                        NaN  "
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#'https://raw.githubusercontent.com/covid19kg/covid19kg/master/supplement/terminology.csv'\n",
-    "url = 'https://raw.githubusercontent.com/CoronaWhy/task-vt/kaleidoescape_kg/immunology_kg/relations/covid19_frauenhofer_annotations.csv'\n",
-    "pybel_pd = pd.read_csv(url)\n",
-    "pybel_pd.head(7)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array(['negativeCorrelation', 'decreases', 'regulates', 'increases',\n",
-       "       'positiveCorrelation', 'association', 'isA', 'biomarkerFor',\n",
-       "       'prognosticBiomarkerFor', 'causesNoChange'], dtype=object)"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#list all types of relations\n",
-    "relation_categories = pybel_pd['relation'].unique()\n",
-    "relation_categories"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 2. Snorkel Example\n",
-    "\n",
-    "For the purpose of this example we'll only focus on rows with 'negativeCorrelation' and 'positiveCorrelation' as their relations."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 131,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
-      "Try using .loc[row_indexer,col_indexer] = value instead\n",
-      "\n",
-      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
-      "  \n",
-      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:4102: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame\n",
-      "\n",
-      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
-      "  errors=errors,\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "      <th>source</th>\n",
-       "      <th>relation</th>\n",
-       "      <th>target</th>\n",
-       "      <th>link</th>\n",
-       "      <th>pmc_id</th>\n",
-       "      <th>doi_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>0</td>\n",
-       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
-       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32221306.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1</td>\n",
-       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
-       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>2</td>\n",
-       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
-       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
-       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
-       "      <td>32217556.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>3</td>\n",
-       "      <td>The administration of methylprednisolone appea...</td>\n",
-       "      <td>{'6-methylprednisolone': {'namespace': 'chebi'...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'Death': {'namespace': 'mesh', 'name': 'Death...</td>\n",
-       "      <td>{'annotations': {'doid': {'11394': True}}, 'ci...</td>\n",
-       "      <td>32167524.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>4</td>\n",
-       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
-       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'Low-grade fever': {'namespace': 'hp', 'name'...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32166483.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                text  \\\n",
-       "0  While blocking TPC2 activity by tetrandrine, a...   \n",
-       "1  Chemoinformatics searches yielded 15 approved ...   \n",
-       "2  Thyroid stimulating hormone and free triiodoth...   \n",
-       "3  The administration of methylprednisolone appea...   \n",
-       "4  Adverse reactions of IFN-α mainly include low-...   \n",
-       "\n",
-       "                                              source  relation  \\\n",
-       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...      True   \n",
-       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...      True   \n",
-       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...      True   \n",
-       "3  {'6-methylprednisolone': {'namespace': 'chebi'...      True   \n",
-       "4  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
-       "\n",
-       "                                              target  \\\n",
-       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
-       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
-       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
-       "3  {'Death': {'namespace': 'mesh', 'name': 'Death...   \n",
-       "4  {'Low-grade fever': {'namespace': 'hp', 'name'...   \n",
-       "\n",
-       "                                                link      pmc_id  \\\n",
-       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
-       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
-       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
-       "3  {'annotations': {'doid': {'11394': True}}, 'ci...  32167524.0   \n",
-       "4  {'annotations': {}, 'citation': {'authors': ['...  32166483.0   \n",
-       "\n",
-       "                                      doi_id  \n",
-       "0                                        NaN  \n",
-       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
-       "2                                        NaN  \n",
-       "3                                        NaN  \n",
-       "4                                        NaN  "
-      ]
-     },
-     "execution_count": 131,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "example_data = pybel_pd[(pybel_pd['relation']=='negativeCorrelation') | (pybel_pd['relation']=='positiveCorrelation') ]\n",
-    "example_data['relation'] = example_data['relation']=='negativeCorrelation'\n",
-    "example_data.reset_index(inplace=True,drop=True)\n",
-    "example_data.drop('Unnamed: 0',inplace=True,axis=1)\n",
-    "example_data.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.1 Split the data into training and testing \n",
-    "\n",
-    "Ideally should have training , validation and testing set. Also, here , I'm using a fixed testing period but k-fold cross validation techniques are a more robust way of determining the accuracy of the generated labels."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 132,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_train,df_test,y_train,y_test = train_test_split(example_data[['text']],example_data[['relation']],test_size=0.20,shuffle=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.2 Reading sentences to understand syntactic differences betweem negative and positive correlation sentences\n",
-    "\n",
-    "The utility of snorkel is that it allows you to create multiple labelling functions which (try to) mimic the rules that a human annotator of data would apply while deciding how to label unlabelled data. For instance a human annotator looking to identify negative correlation is sentences will follow the following rules:\n",
-    "\n",
-    "##### A. Positive rules: rules which tell you what a sentences with negative correlation looks like\n",
-    "\n",
-    "1) does the sentence contain the reduction related words like words 'decreased','reduced','lowered'\n",
-    "\n",
-    "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'\n",
-    "\n",
-    "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)\n",
-    "\n",
-    "4) does the sentence contain the expression 'negative effect'\n",
-    "\n",
-    "5) does the sentence contain the expression 'move in opposite directions'\n",
-    "\n",
-    "##### B. Negative rules: rules which tell you what a sentences without negative correlation looks like\n",
-    "\n",
-    "1) does the sentence contain the increase related words like words 'increased','improved'\n",
-    "\n",
-    "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related'\n",
-    "\n",
-    "3) does the syntax follow the structure 'increase in x is associated with a increase in y' (and vice versa)\n",
-    "\n",
-    "4) does the sentence contain the expression 'positive effect'\n",
-    "\n",
-    "5) does the sentence contain the expression 'move in the same direction'\n",
-    "\n",
-    "These rules can be coded using snorkel. Importantly it requires both positive and negative rules. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### 2.2.1 Examining Negative correlation sentences to understand their syntactic structure and then define labelling functions accordingly"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 133,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "      <th>source</th>\n",
-       "      <th>relation</th>\n",
-       "      <th>target</th>\n",
-       "      <th>link</th>\n",
-       "      <th>pmc_id</th>\n",
-       "      <th>doi_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>0</td>\n",
-       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
-       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32221306.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1</td>\n",
-       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
-       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>2</td>\n",
-       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
-       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
-       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
-       "      <td>32217556.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>3</td>\n",
-       "      <td>The administration of methylprednisolone appea...</td>\n",
-       "      <td>{'6-methylprednisolone': {'namespace': 'chebi'...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'Death': {'namespace': 'mesh', 'name': 'Death...</td>\n",
-       "      <td>{'annotations': {'doid': {'11394': True}}, 'ci...</td>\n",
-       "      <td>32167524.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>4</td>\n",
-       "      <td>In our opinion, during the COVID-19 pandemic, ...</td>\n",
-       "      <td>{'adrenergic antagonist': {'namespace': 'chebi...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32220710.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>5</td>\n",
-       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
-       "      <td>{'ammonium chloride': {'namespace': 'chebi', '...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'G protein, vesicular stomatitis virus': {'na...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32221306.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>6</td>\n",
-       "      <td>If the latter percentage would be found to be ...</td>\n",
-       "      <td>{'angiotensin receptor antagonist': {'namespac...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32129518.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>7</td>\n",
-       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
-       "      <td>{'bafilomycin A1': {'namespace': 'chebi', 'nam...</td>\n",
-       "      <td>True</td>\n",
-       "      <td>{'G protein, vesicular stomatitis virus': {'na...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32221306.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                text  \\\n",
-       "0  While blocking TPC2 activity by tetrandrine, a...   \n",
-       "1  Chemoinformatics searches yielded 15 approved ...   \n",
-       "2  Thyroid stimulating hormone and free triiodoth...   \n",
-       "3  The administration of methylprednisolone appea...   \n",
-       "4  In our opinion, during the COVID-19 pandemic, ...   \n",
-       "5  Consistent with previous reports, 20mM NH4Cl a...   \n",
-       "6  If the latter percentage would be found to be ...   \n",
-       "7  Consistent with previous reports, 20mM NH4Cl a...   \n",
-       "\n",
-       "                                              source  relation  \\\n",
-       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...      True   \n",
-       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...      True   \n",
-       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...      True   \n",
-       "3  {'6-methylprednisolone': {'namespace': 'chebi'...      True   \n",
-       "4  {'adrenergic antagonist': {'namespace': 'chebi...      True   \n",
-       "5  {'ammonium chloride': {'namespace': 'chebi', '...      True   \n",
-       "6  {'angiotensin receptor antagonist': {'namespac...      True   \n",
-       "7  {'bafilomycin A1': {'namespace': 'chebi', 'nam...      True   \n",
-       "\n",
-       "                                              target  \\\n",
-       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
-       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
-       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
-       "3  {'Death': {'namespace': 'mesh', 'name': 'Death...   \n",
-       "4  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
-       "5  {'G protein, vesicular stomatitis virus': {'na...   \n",
-       "6  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
-       "7  {'G protein, vesicular stomatitis virus': {'na...   \n",
-       "\n",
-       "                                                link      pmc_id  \\\n",
-       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
-       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
-       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
-       "3  {'annotations': {'doid': {'11394': True}}, 'ci...  32167524.0   \n",
-       "4  {'annotations': {}, 'citation': {'authors': ['...  32220710.0   \n",
-       "5  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
-       "6  {'annotations': {}, 'citation': {'authors': ['...  32129518.0   \n",
-       "7  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
-       "\n",
-       "                                      doi_id  \n",
-       "0                                        NaN  \n",
-       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
-       "2                                        NaN  \n",
-       "3                                        NaN  \n",
-       "4                                        NaN  \n",
-       "5                                        NaN  \n",
-       "6                                        NaN  \n",
-       "7                                        NaN  "
-      ]
-     },
-     "execution_count": 133,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Keep the rows with relation == 1 and renumber them from 0 in a single expression.\n",
-    "is_negative_correlation = example_data['relation'] == 1\n",
-    "neg_correl_df = example_data[is_negative_correlation].reset_index(drop=True)\n",
-    "neg_correl_df.head(8)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 134,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry.'"
-      ]
-     },
-     "execution_count": 134,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "neg_correl_df.loc[0, 'text']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### 2.2.2 Examining positive correlation sentences to understand their syntactic structure and then define labelling functions accordingly"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 135,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "      <th>source</th>\n",
-       "      <th>relation</th>\n",
-       "      <th>target</th>\n",
-       "      <th>link</th>\n",
-       "      <th>pmc_id</th>\n",
-       "      <th>doi_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>0</td>\n",
-       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
-       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'Low-grade fever': {'namespace': 'hp', 'name'...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32166483.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1</td>\n",
-       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
-       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'influenza': {'namespace': 'doid', 'name': 'i...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32166483.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>2</td>\n",
-       "      <td>This may be accounted for by two complementary...</td>\n",
-       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32129518.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>3</td>\n",
-       "      <td>ACE2 can also antagonize cardiac fibrosis and ...</td>\n",
-       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'Ventricular Remodeling': {'namespace': 'mesh...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32221983.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>4</td>\n",
-       "      <td>ACE2 can also antagonize cardiac fibrosis and ...</td>\n",
-       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'Myocardial fibrosis': {'namespace': 'hp', 'n...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32221983.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>5</td>\n",
-       "      <td>The existence of significantly increased fibri...</td>\n",
-       "      <td>{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'Hyperfibrinolysis': {'namespace': 'hp', 'nam...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32216698.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>6</td>\n",
-       "      <td>This opinion is supported by the presence of h...</td>\n",
-       "      <td>{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'Hemorrhage': {'namespace': 'mesh', 'name': '...</td>\n",
-       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
-       "      <td>32216698.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>7</td>\n",
-       "      <td>In the influenza virus model, it was reported ...</td>\n",
-       "      <td>{'chloroquine': {'namespace': 'chebi', 'name':...</td>\n",
-       "      <td>False</td>\n",
-       "      <td>{'dendritic cell antigen processing and presen...</td>\n",
-       "      <td>{'annotations': {'mesh': {'D007251': True}}, '...</td>\n",
-       "      <td>32171740.0</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                text  \\\n",
-       "0  Adverse reactions of IFN-α mainly include low-...   \n",
-       "1  Adverse reactions of IFN-α mainly include low-...   \n",
-       "2  This may be accounted for by two complementary...   \n",
-       "3  ACE2 can also antagonize cardiac fibrosis and ...   \n",
-       "4  ACE2 can also antagonize cardiac fibrosis and ...   \n",
-       "5  The existence of significantly increased fibri...   \n",
-       "6  This opinion is supported by the presence of h...   \n",
-       "7  In the influenza virus model, it was reported ...   \n",
-       "\n",
-       "                                              source  relation  \\\n",
-       "0  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
-       "1  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
-       "2  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
-       "3  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
-       "4  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
-       "5  {'Fibrin': {'namespace': 'chebi', 'name': 'Fib...     False   \n",
-       "6  {'Fibrin': {'namespace': 'chebi', 'name': 'Fib...     False   \n",
-       "7  {'chloroquine': {'namespace': 'chebi', 'name':...     False   \n",
-       "\n",
-       "                                              target  \\\n",
-       "0  {'Low-grade fever': {'namespace': 'hp', 'name'...   \n",
-       "1  {'influenza': {'namespace': 'doid', 'name': 'i...   \n",
-       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
-       "3  {'Ventricular Remodeling': {'namespace': 'mesh...   \n",
-       "4  {'Myocardial fibrosis': {'namespace': 'hp', 'n...   \n",
-       "5  {'Hyperfibrinolysis': {'namespace': 'hp', 'nam...   \n",
-       "6  {'Hemorrhage': {'namespace': 'mesh', 'name': '...   \n",
-       "7  {'dendritic cell antigen processing and presen...   \n",
-       "\n",
-       "                                                link      pmc_id doi_id  \n",
-       "0  {'annotations': {}, 'citation': {'authors': ['...  32166483.0    NaN  \n",
-       "1  {'annotations': {}, 'citation': {'authors': ['...  32166483.0    NaN  \n",
-       "2  {'annotations': {}, 'citation': {'authors': ['...  32129518.0    NaN  \n",
-       "3  {'annotations': {}, 'citation': {'authors': ['...  32221983.0    NaN  \n",
-       "4  {'annotations': {}, 'citation': {'authors': ['...  32221983.0    NaN  \n",
-       "5  {'annotations': {}, 'citation': {'authors': ['...  32216698.0    NaN  \n",
-       "6  {'annotations': {}, 'citation': {'authors': ['...  32216698.0    NaN  \n",
-       "7  {'annotations': {'mesh': {'D007251': True}}, '...  32171740.0    NaN  "
-      ]
-     },
-     "execution_count": 135,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Keep the rows with relation == 0 and renumber them from 0 in a single expression.\n",
-    "is_positive_relation = example_data['relation'] == 0\n",
-    "positive_relation_df = example_data[is_positive_relation].reset_index(drop=True)\n",
-    "positive_relation_df.head(8)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 136,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Adverse reactions of IFN-α mainly include low-grade fever and flu-like symptoms (both in children with intramuscularly injection) [11].'"
-      ]
-     },
-     "execution_count": 136,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "positive_relation_df.loc[0, 'text']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.3 Source-Target dictionary"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "A simple but clean rule for identifying negative correlation sentences is to check whether negative tokens occur in the words between the source and the target. So, a source-target dictionary is created for some of the examples (in the final pipeline the source-target dictionary will be obtained from the spaCy pipeline)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 137,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[('tetrandrine', 'TPC2'),\n",
-       " ('triiodothyronine', 'recovered'),\n",
-       " ('methylprednisolone', 'death'),\n",
-       " ('IFN-α', 'fever'),\n",
-       " ('angiotensin', 'vasodilator'),\n",
-       " ('ACE2', 'Ang'),\n",
-       " ('fibrin', 'COVID-19'),\n",
-       " ('hemorrhage', 'fibrinolysis'),\n",
-       " ('chloroquine', 'dendritic')]"
-      ]
-     },
-     "execution_count": 137,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Hand-picked mentions: example_sources[i] is paired with example_targets[i].\n",
-    "example_sources = [\n",
-    "    'tetrandrine',\n",
-    "    'triiodothyronine',\n",
-    "    'methylprednisolone',\n",
-    "    'IFN-α',\n",
-    "    'angiotensin',\n",
-    "    'ACE2',\n",
-    "    'fibrin',\n",
-    "    'hemorrhage',\n",
-    "    'chloroquine',\n",
-    "]\n",
-    "\n",
-    "example_targets = [\n",
-    "    'TPC2',\n",
-    "    'recovered',\n",
-    "    'death',\n",
-    "    'fever',  # low-grade fever\n",
-    "    'vasodilator',\n",
-    "    'Ang',\n",
-    "    'COVID-19',\n",
-    "    'fibrinolysis',\n",
-    "    'dendritic',\n",
-    "]\n",
-    "\n",
-    "# Despite the name this is a list of (source, target) tuples; OrderedDict.fromkeys\n",
-    "# drops repeated pairs while keeping first-seen order.\n",
-    "paired_mentions = zip(example_sources, example_targets)\n",
-    "example_source_target_dict = list(OrderedDict.fromkeys(paired_mentions))\n",
-    "example_source_target_dict"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.4 Labeling functions for RE\n",
-    "\n",
-    "#### 2.4.1 Preprocessing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 138,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Label values returned by the labeling functions defined below.\n",
-    "ABSTAIN = -1\n",
-    "NOT_FOUND = 0\n",
-    "FOUND = 1\n",
-    "\n",
-    "# Shared preprocessor: reads the 'text' field and writes its result to the 'doc' field.\n",
-    "spacy = SpacyPreprocessor(text_field=\"text\", doc_field=\"doc\", memoize=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 139,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Source target pair:  ('tetrandrine', 'TPC2')\n"
-     ]
-    }
-   ],
-   "source": [
-    "@preprocessor(pre=[spacy])\n",
-    "def get_source_target(cand: DataPoint) -> DataPoint:\n",
-    "    \"\"\"\n",
-    "    Attach the first source and first target mention found in the sentence.\n",
-    "\n",
-    "    Tokens of ``cand.doc`` are matched by exact text against ``example_sources``\n",
-    "    and ``example_targets``. The pair is stored on ``cand.source_target`` as a\n",
-    "    ``(source, target)`` tuple, or as ``(nan, nan)`` when either mention is missing.\n",
-    "    \"\"\"\n",
-    "    source = next((token.text for token in cand.doc if token.text in example_sources), None)\n",
-    "    target = next((token.text for token in cand.doc if token.text in example_targets), None)\n",
-    "\n",
-    "    if source is None or target is None:\n",
-    "        # float('nan') rather than np.nan: numpy is not among the notebook's imports,\n",
-    "        # so the previous fallback raised NameError instead of recording a missing pair.\n",
-    "        cand.source_target = (float('nan'), float('nan'))\n",
-    "    else:\n",
-    "        cand.source_target = (source, target)\n",
-    "    return cand\n",
-    "\n",
-    "########### function example #####################\n",
-    "\n",
-    "candidate = example_data.loc[0]\n",
-    "candidate_with_function_applied = get_source_target(candidate) \n",
-    "\n",
-    "print(\"Source target pair: \", candidate_with_function_applied.source_target)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 140,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Sentence:  Thyroid stimulating hormone and free triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in recovered patients (1.4 mIU/mL and 4.3 pmol/L).\n",
-      "Text Between:  triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in\n",
-      "Text Between:  Thyroid stimulating hormone and free\n"
-     ]
-    }
-   ],
-   "source": [
-    "@preprocessor(pre=[spacy])\n",
-    "def get_text_between(cand: DataPoint) -> DataPoint:\n",
-    "    \"\"\"\n",
-    "    Attach the span between a source mention and a later target mention.\n",
-    "\n",
-    "    Tokens of ``cand.doc`` are matched by exact text against ``example_sources``\n",
-    "    and ``example_targets``. The right-most target mention is paired with the\n",
-    "    right-most source mention that precedes it, and two attributes are set:\n",
-    "\n",
-    "    * ``cand.text_between``: span from the source token (inclusive) up to the\n",
-    "      target token (exclusive).\n",
-    "    * ``cand.text_to_source_left``: span of everything before the source token.\n",
-    "\n",
-    "    Both attributes are the string 'NaN' when no source mention precedes a\n",
-    "    target mention.\n",
-    "    \"\"\"\n",
-    "    source_idx = [token.i for token in cand.doc if token.text in example_sources]\n",
-    "    target_idx = [token.i for token in cand.doc if token.text in example_targets]\n",
-    "\n",
-    "    # Set the fallback first so both attributes always exist. Previously the\n",
-    "    # multi-mention branches could finish without assigning them, which made the\n",
-    "    # labeling functions that read x.text_between fail with AttributeError.\n",
-    "    cand.text_between = 'NaN'\n",
-    "    cand.text_to_source_left = 'NaN'\n",
-    "\n",
-    "    if source_idx and target_idx:\n",
-    "        # Indices are collected in document order, so [-1] is the right-most mention.\n",
-    "        target_index = target_idx[-1]\n",
-    "        preceding_sources = [index for index in source_idx if index < target_index]\n",
-    "        if preceding_sources:\n",
-    "            source_index = preceding_sources[-1]\n",
-    "            cand.text_between = cand.doc[source_index:target_index]\n",
-    "            cand.text_to_source_left = cand.doc[:source_index]\n",
-    "\n",
-    "    return cand\n",
-    "\n",
-    "############ function example ###############################\n",
-    "candidate = example_data.loc[2]\n",
-    "\n",
-    "candidate_with_function_applied = get_text_between(candidate)\n",
-    "\n",
-    "print(\"Sentence: \", candidate[\"text\"])\n",
-    "print(\"Text Between: \", candidate_with_function_applied.text_between)\n",
-    "print(\"Text To Source Left: \", candidate_with_function_applied.text_to_source_left)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### 2.4.2 Defining the labelling functions\n",
-    "\n",
-    "For the final labelling model to work, at least 3 rules are needed.\n",
-    "\n",
-    "##### A. Positive rules: rules which tell you what a sentence with negative correlation looks like\n",
-    "    \n",
-    "1) does the sentence contain reduction-related words such as 'decreased', 'reduced', 'lowered'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 141,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "reduction_tokens = {'decreased', 'lower', 'reduced'}\n",
-    "\n",
-    "\n",
-    "@labeling_function(pre=[spacy], resources=dict(reduction_tokens=reduction_tokens))\n",
-    "def contains_reduction_tokens(x, reduction_tokens):\n",
-    "    \"\"\"Vote FOUND when any token of the sentence is a reduction word, else ABSTAIN.\"\"\"\n",
-    "    sentence_tokens = {str(token) for token in x.doc}\n",
-    "    if reduction_tokens & sentence_tokens:\n",
-    "        return FOUND\n",
-    "    return ABSTAIN\n",
-    "\n",
-    "\n",
-    "# positive rule - version 2\n",
-    "@labeling_function(pre=[spacy, get_text_between], resources=dict(reduction_tokens=reduction_tokens))\n",
-    "def contains_reduction_tokens_text_between(x, reduction_tokens):\n",
-    "    \"\"\"Vote FOUND when a reduction word occurs in x.text_between, else ABSTAIN.\"\"\"\n",
-    "    span_tokens = {str(token) for token in x.text_between}\n",
-    "    return ABSTAIN if reduction_tokens.isdisjoint(span_tokens) else FOUND\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 142,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "negative_correlation_regex_1 = 'negative correlation'\n",
-    "negative_correlation_regex_2 = 'negatively correlated'\n",
-    "negative_correlation_regex_3 = 'negatively related'\n",
-    "negative_correlation_regex_4 = 'inversely related'\n",
-    "negative_correlation_regex_5 = 'inverse relation'\n",
-    "negative_correlation_regex_6 = 'negative effect'\n",
-    "negative_correlation_regex_7 = 'move in opposite directions'\n",
-    "\n",
-    "@labeling_function()\n",
-    "def contains_negative_corrrelation_regex(x):\n",
-    "    \"\"\"\n",
-    "    Vote FOUND when the sentence text matches any of the negative-correlation\n",
-    "    phrases above (case-insensitive); otherwise ABSTAIN.\n",
-    "    \"\"\"\n",
-    "    negative_patterns = (\n",
-    "        negative_correlation_regex_1,\n",
-    "        negative_correlation_regex_2,\n",
-    "        negative_correlation_regex_3,\n",
-    "        negative_correlation_regex_4,\n",
-    "        negative_correlation_regex_5,\n",
-    "        negative_correlation_regex_6,\n",
-    "        negative_correlation_regex_7,\n",
-    "    )\n",
-    "    if any(re.search(pattern, x.text, flags=re.I) for pattern in negative_patterns):\n",
-    "        return FOUND\n",
-    "    return ABSTAIN\n",
-    "    "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 143,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "@labeling_function(pre=[spacy,get_text_between])\n",
-    "def contains_increase_decrease_pattern(x):\n",
-    "    \"\"\"\n",
-    "    Vote FOUND when 'increase' appears left of the source and 'decrease' between\n",
-    "    source and target, or the other way round; otherwise ABSTAIN.\n",
-    "    \"\"\"\n",
-    "    left_tokens = [str(token) for token in x.text_to_source_left]\n",
-    "    between_tokens = [str(token) for token in x.text_between]\n",
-    "\n",
-    "    rises_then_falls = 'increase' in left_tokens and 'decrease' in between_tokens\n",
-    "    falls_then_rises = 'decrease' in left_tokens and 'increase' in between_tokens\n",
-    "    return FOUND if (rises_then_falls or falls_then_rises) else ABSTAIN"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### B. Negative rules: rules which tell you what a sentence without negative correlation looks like\n",
-    "\n",
-    "1) does the sentence contain increase-related words such as 'increased', 'higher'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 144,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "increase_tokens = {'increased', 'higher'}\n",
-    "\n",
-    "\n",
-    "@labeling_function(pre=[spacy], resources=dict(increase_tokens=increase_tokens))\n",
-    "def contains_increase_tokens(x, increase_tokens):\n",
-    "    \"\"\"Vote NOT_FOUND when any token of the sentence is an increase word, else ABSTAIN.\"\"\"\n",
-    "    sentence_tokens = {str(token) for token in x.doc}\n",
-    "    if increase_tokens & sentence_tokens:\n",
-    "        return NOT_FOUND\n",
-    "    return ABSTAIN\n",
-    "\n",
-    "\n",
-    "@labeling_function(pre=[spacy, get_text_between], resources=dict(increase_tokens=increase_tokens))\n",
-    "def contains_increase_tokens_text_between(x, increase_tokens):\n",
-    "    \"\"\"Vote NOT_FOUND when an increase word occurs in x.text_between, else ABSTAIN.\"\"\"\n",
-    "    span_tokens = {str(token) for token in x.text_between}\n",
-    "    return ABSTAIN if increase_tokens.isdisjoint(span_tokens) else NOT_FOUND"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related' or 'positive effect' or 'move in the same direction'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 145,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#regex\n",
-    "positive_correlation_regex_1 = 'positive correlation'\n",
-    "positive_correlation_regex_2 = 'positively correlated'\n",
-    "positive_correlation_regex_3 = 'positively related'\n",
-    "positive_correlation_regex_4 = 'positive effect'\n",
-    "positive_correlation_regex_5 = 'move in the same direction'\n",
-    "\n",
-    "@labeling_function()\n",
-    "def contains_positive_corrrelation_regex(x):\n",
-    "    \"\"\"\n",
-    "    Negative rule: vote NOT_FOUND when the sentence text matches any of the\n",
-    "    positive-correlation phrases above (case-insensitive); otherwise ABSTAIN.\n",
-    "    \"\"\"\n",
-    "    positive_patterns = (\n",
-    "        positive_correlation_regex_1,\n",
-    "        positive_correlation_regex_2,\n",
-    "        positive_correlation_regex_3,\n",
-    "        positive_correlation_regex_4,\n",
-    "        positive_correlation_regex_5,\n",
-    "    )\n",
-    "    if any(re.search(pattern, x.text, flags=re.I) for pattern in positive_patterns):\n",
-    "        # FOUND is the negative-correlation label, so a positive-correlation phrase\n",
-    "        # must vote NOT_FOUND, as the increase-token rules do. Returning FOUND here\n",
-    "        # labelled positively correlated sentences as negative correlations.\n",
-    "        return NOT_FOUND\n",
-    "    return ABSTAIN"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "3) does the syntax follow the structure 'increase in x is associated with an increase in y' (and vice versa)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 146,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "@labeling_function(pre=[spacy,get_text_between])\n",
-    "def contains_increase_increase_pattern(x):\n",
-    "    \"\"\"\n",
-    "    Negative rule: vote NOT_FOUND when the same direction word ('increase' or\n",
-    "    'decrease') appears both left of the source and between source and target;\n",
-    "    otherwise ABSTAIN.\n",
-    "    \"\"\"\n",
-    "    left_tokens = [str(token) for token in x.text_to_source_left]\n",
-    "    between_tokens = [str(token) for token in x.text_between]\n",
-    "\n",
-    "    both_increase = 'increase' in left_tokens and 'increase' in between_tokens\n",
-    "    both_decrease = 'decrease' in left_tokens and 'decrease' in between_tokens\n",
-    "    if both_increase or both_decrease:\n",
-    "        # Moving in the same direction is evidence against a negative correlation,\n",
-    "        # so this rule votes NOT_FOUND; it previously returned FOUND, the\n",
-    "        # negative-correlation label.\n",
-    "        return NOT_FOUND\n",
-    "    return ABSTAIN"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.5 Creating all the labels for the different rules"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 147,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
-      "  from pandas import Panel\n",
-      "\n",
-      "  0%|                                                                                         | 0/2227 [00:00<?, ?it/s]\u001b[A\n",
-      "  0%|▏                                                                                | 4/2227 [00:00<01:02, 35.32it/s]\u001b[A\n",
-      "  0%|▎                                                                                | 9/2227 [00:00<00:59, 36.97it/s]\u001b[A\n",
-      "  1%|▌                                                                               | 16/2227 [00:00<00:51, 43.02it/s]\u001b[A\n",
-      "  1%|▋                                                                               | 20/2227 [00:00<00:54, 40.82it/s]\u001b[A\n",
-      "  1%|▊                                                                               | 24/2227 [00:00<00:54, 40.45it/s]\u001b[A\n",
-      "  1%|█                                                                               | 29/2227 [00:00<00:51, 42.66it/s]\u001b[A\n",
-      "  2%|█▎                                                                              | 35/2227 [00:00<00:49, 44.55it/s]\u001b[A\n",
-      "  2%|█▍                                                                              | 40/2227 [00:00<00:50, 42.99it/s]\u001b[A\n",
-      "  2%|█▌                                                                              | 45/2227 [00:01<00:54, 39.87it/s]\u001b[A\n",
-      "  3%|██▏                                                                             | 61/2227 [00:01<00:42, 51.21it/s]\u001b[A\n",
-      "  3%|██▍                                                                             | 69/2227 [00:01<00:40, 53.77it/s]\u001b[A\n",
-      "  3%|██▊                                                                             | 77/2227 [00:01<00:36, 58.84it/s]\u001b[A\n",
-      "  4%|███▏                                                                            | 88/2227 [00:01<00:31, 67.95it/s]\u001b[A\n",
-      "  4%|███▍                                                                            | 97/2227 [00:01<00:34, 62.02it/s]\u001b[A\n",
-      "  5%|████                                                                           | 113/2227 [00:01<00:28, 73.29it/s]\u001b[A\n",
-      "  6%|████▋                                                                          | 132/2227 [00:01<00:23, 89.58it/s]\u001b[A\n",
-      "  7%|█████▎                                                                        | 153/2227 [00:01<00:19, 108.20it/s]\u001b[A\n",
-      "  8%|██████▎                                                                       | 181/2227 [00:02<00:15, 132.17it/s]\u001b[A\n",
-      "  9%|███████                                                                       | 200/2227 [00:02<00:14, 139.37it/s]\u001b[A\n",
-      " 10%|███████▋                                                                      | 218/2227 [00:02<00:16, 125.24it/s]\u001b[A\n",
-      " 11%|████████▎                                                                      | 234/2227 [00:02<00:24, 82.06it/s]\u001b[A\n",
-      " 11%|████████▊                                                                      | 247/2227 [00:02<00:26, 73.44it/s]\u001b[A\n",
-      " 12%|█████████▏                                                                     | 258/2227 [00:03<00:26, 73.62it/s]\u001b[A\n",
-      " 12%|█████████▌                                                                     | 268/2227 [00:03<00:33, 58.67it/s]\u001b[A\n",
-      " 12%|█████████▊                                                                     | 276/2227 [00:03<00:44, 44.08it/s]\u001b[A\n",
-      " 13%|██████████                                                                     | 283/2227 [00:03<00:45, 42.30it/s]\u001b[A\n",
-      " 13%|██████████▎                                                                    | 289/2227 [00:03<00:42, 45.53it/s]\u001b[A\n",
-      " 13%|██████████▌                                                                    | 299/2227 [00:04<00:35, 54.16it/s]\u001b[A\n",
-      " 14%|██████████▉                                                                    | 307/2227 [00:04<00:34, 55.86it/s]\u001b[A\n",
-      " 14%|███████████▏                                                                   | 314/2227 [00:04<00:38, 49.07it/s]\u001b[A\n",
-      " 14%|███████████▍                                                                   | 322/2227 [00:04<00:34, 55.50it/s]\u001b[A\n",
-      " 15%|███████████▋                                                                   | 329/2227 [00:04<00:34, 54.91it/s]\u001b[A\n",
-      " 15%|███████████▉                                                                   | 336/2227 [00:04<00:34, 54.86it/s]\u001b[A\n",
-      " 16%|████████████▎                                                                  | 348/2227 [00:04<00:28, 65.38it/s]\u001b[A\n",
-      " 16%|████████████▉                                                                  | 364/2227 [00:04<00:24, 75.68it/s]\u001b[A\n",
-      " 17%|█████████████▌                                                                 | 381/2227 [00:05<00:20, 90.75it/s]\u001b[A\n",
-      " 18%|█████████████▉                                                                 | 393/2227 [00:05<00:20, 88.80it/s]\u001b[A\n",
-      " 19%|██████████████▋                                                               | 419/2227 [00:05<00:16, 110.40it/s]\u001b[A\n",
-      " 20%|███████████████▍                                                              | 441/2227 [00:05<00:13, 129.33it/s]\u001b[A\n",
-      " 21%|████████████████                                                              | 459/2227 [00:05<00:13, 131.51it/s]\u001b[A\n",
-      " 21%|████████████████▋                                                             | 476/2227 [00:05<00:12, 139.84it/s]\u001b[A\n",
-      " 22%|█████████████████▎                                                            | 493/2227 [00:05<00:15, 112.09it/s]\u001b[A\n",
-      " 23%|█████████████████▊                                                            | 507/2227 [00:05<00:16, 106.92it/s]\u001b[A\n",
-      " 24%|██████████████████▎                                                           | 524/2227 [00:06<00:14, 120.04it/s]\u001b[A\n",
-      " 24%|███████████████████                                                           | 545/2227 [00:06<00:12, 136.86it/s]\u001b[A\n",
-      " 25%|███████████████████▊                                                          | 565/2227 [00:06<00:11, 149.93it/s]\u001b[A\n",
-      " 26%|████████████████████▍                                                         | 583/2227 [00:06<00:10, 157.50it/s]\u001b[A\n",
-      " 27%|█████████████████████                                                         | 601/2227 [00:06<00:10, 156.92it/s]\u001b[A\n",
-      " 28%|█████████████████████▋                                                        | 619/2227 [00:06<00:09, 162.40it/s]\u001b[A\n",
-      " 29%|██████████████████████▌                                                       | 644/2227 [00:06<00:08, 181.28it/s]\u001b[A\n",
-      " 30%|███████████████████████▎                                                      | 665/2227 [00:06<00:08, 188.40it/s]\u001b[A\n",
-      " 31%|███████████████████████▉                                                      | 685/2227 [00:06<00:08, 184.86it/s]\u001b[A\n",
-      " 32%|████████████████████████▋                                                     | 705/2227 [00:07<00:08, 183.00it/s]\u001b[A\n",
-      " 33%|█████████████████████████▍                                                    | 725/2227 [00:07<00:08, 187.58it/s]\u001b[A\n",
-      " 34%|██████████████████████████▏                                                   | 748/2227 [00:07<00:07, 192.09it/s]\u001b[A\n",
-      " 34%|██████████████████████████▉                                                   | 768/2227 [00:07<00:08, 180.88it/s]\u001b[A\n",
-      " 35%|███████████████████████████▌                                                  | 787/2227 [00:07<00:08, 176.97it/s]\u001b[A\n",
-      " 36%|████████████████████████████▏                                                 | 805/2227 [00:07<00:10, 136.68it/s]\u001b[A\n",
-      " 37%|████████████████████████████▊                                                 | 821/2227 [00:07<00:12, 117.14it/s]\u001b[A\n",
-      " 37%|█████████████████████████████▌                                                 | 835/2227 [00:08<00:16, 82.38it/s]\u001b[A\n",
-      " 38%|██████████████████████████████                                                 | 846/2227 [00:08<00:21, 63.65it/s]\u001b[A\n",
-      " 38%|██████████████████████████████▎                                                | 855/2227 [00:08<00:25, 53.53it/s]\u001b[A\n",
-      " 39%|██████████████████████████████▌                                                | 863/2227 [00:08<00:25, 53.50it/s]\u001b[A\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      " 39%|██████████████████████████████▊                                                | 870/2227 [00:08<00:27, 49.89it/s]\u001b[A\n",
-      " 40%|███████████████████████████████▍                                               | 887/2227 [00:09<00:21, 63.09it/s]\u001b[A\n",
-      " 41%|███████████████████████████████▉                                               | 902/2227 [00:09<00:17, 75.94it/s]\u001b[A\n",
-      " 41%|████████████████████████████████▍                                              | 913/2227 [00:09<00:21, 62.34it/s]\u001b[A\n",
-      " 41%|████████████████████████████████▋                                              | 922/2227 [00:09<00:21, 60.38it/s]\u001b[A\n",
-      " 42%|████████████████████████████████▉                                              | 930/2227 [00:09<00:22, 58.34it/s]\u001b[A\n",
-      " 42%|█████████████████████████████████▎                                             | 939/2227 [00:09<00:20, 63.84it/s]\u001b[A\n",
-      " 43%|█████████████████████████████████▋                                             | 951/2227 [00:09<00:17, 73.07it/s]\u001b[A\n",
-      " 43%|██████████████████████████████████                                             | 961/2227 [00:10<00:17, 70.72it/s]\u001b[A\n",
-      " 44%|██████████████████████████████████▎                                            | 969/2227 [00:10<00:22, 56.28it/s]\u001b[A\n",
-      " 44%|██████████████████████████████████▌                                            | 976/2227 [00:10<00:23, 54.06it/s]\u001b[A\n",
-      " 44%|██████████████████████████████████▊                                            | 983/2227 [00:10<00:23, 53.22it/s]\u001b[A\n",
-      " 44%|███████████████████████████████████                                            | 989/2227 [00:10<00:26, 46.14it/s]\u001b[A\n",
-      " 45%|███████████████████████████████████▎                                           | 995/2227 [00:10<00:25, 49.12it/s]\u001b[A\n",
-      " 45%|███████████████████████████████████                                           | 1001/2227 [00:11<00:26, 45.93it/s]\u001b[A\n",
-      " 45%|███████████████████████████████████▍                                          | 1012/2227 [00:11<00:22, 54.76it/s]\u001b[A\n",
-      " 46%|███████████████████████████████████▋                                          | 1019/2227 [00:11<00:23, 52.42it/s]\u001b[A\n",
-      " 46%|████████████████████████████████████▎                                         | 1035/2227 [00:11<00:18, 65.65it/s]\u001b[A\n",
-      " 47%|████████████████████████████████████▌                                         | 1044/2227 [00:11<00:17, 68.59it/s]\u001b[A\n",
-      " 47%|████████████████████████████████████▉                                         | 1053/2227 [00:11<00:20, 58.18it/s]\u001b[A\n",
-      " 48%|█████████████████████████████████████▏                                        | 1061/2227 [00:11<00:21, 54.12it/s]\u001b[A\n",
-      " 48%|█████████████████████████████████████▌                                        | 1073/2227 [00:12<00:18, 63.03it/s]\u001b[A\n",
-      " 49%|█████████████████████████████████████▊                                        | 1081/2227 [00:12<00:21, 52.77it/s]\u001b[A\n",
-      " 49%|██████████████████████████████████████▏                                       | 1090/2227 [00:12<00:19, 58.99it/s]\u001b[A\n",
-      " 49%|██████████████████████████████████████▌                                       | 1102/2227 [00:12<00:16, 69.14it/s]\u001b[A\n",
-      " 50%|██████████████████████████████████████▉                                       | 1112/2227 [00:12<00:15, 72.89it/s]\u001b[A\n",
-      " 50%|███████████████████████████████████████▎                                      | 1121/2227 [00:12<00:14, 75.90it/s]\u001b[A\n",
-      " 51%|███████████████████████████████████████▊                                      | 1138/2227 [00:12<00:12, 90.66it/s]\u001b[A\n",
-      " 52%|████████████████████████████████████████▎                                     | 1151/2227 [00:12<00:10, 99.05it/s]\u001b[A\n",
-      " 52%|████████████████████████████████████████▋                                     | 1163/2227 [00:12<00:10, 98.04it/s]\u001b[A\n",
-      " 53%|████████████████████████████████████████▊                                    | 1180/2227 [00:13<00:09, 111.73it/s]\u001b[A\n",
-      " 54%|█████████████████████████████████████████▏                                   | 1193/2227 [00:13<00:09, 112.70it/s]\u001b[A\n",
-      " 54%|█████████████████████████████████████████▋                                   | 1207/2227 [00:13<00:08, 115.89it/s]\u001b[A\n",
-      " 55%|██████████████████████████████████████████▎                                  | 1224/2227 [00:13<00:07, 128.12it/s]\u001b[A\n",
-      " 56%|██████████████████████████████████████████▊                                  | 1239/2227 [00:13<00:07, 132.43it/s]\u001b[A\n",
-      " 56%|███████████████████████████████████████████▎                                 | 1253/2227 [00:13<00:09, 100.50it/s]\u001b[A\n",
-      " 57%|████████████████████████████████████████████▎                                 | 1265/2227 [00:13<00:09, 97.93it/s]\u001b[A\n",
-      " 57%|████████████████████████████████████████████▏                                | 1277/2227 [00:13<00:09, 102.33it/s]\u001b[A\n",
-      " 58%|█████████████████████████████████████████████▏                                | 1289/2227 [00:14<00:12, 77.26it/s]\u001b[A\n",
-      " 58%|█████████████████████████████████████████████▍                                | 1299/2227 [00:14<00:13, 69.14it/s]\u001b[A\n",
-      " 59%|█████████████████████████████████████████████▊                                | 1309/2227 [00:14<00:12, 75.56it/s]\u001b[A\n",
-      " 60%|██████████████████████████████████████████████▌                               | 1330/2227 [00:14<00:09, 93.25it/s]\u001b[A\n",
-      " 60%|███████████████████████████████████████████████                               | 1343/2227 [00:14<00:08, 98.27it/s]\u001b[A\n",
-      " 61%|███████████████████████████████████████████████▏                             | 1363/2227 [00:14<00:07, 114.93it/s]\u001b[A\n",
-      " 62%|███████████████████████████████████████████████▋                             | 1379/2227 [00:14<00:06, 125.14it/s]\u001b[A\n",
-      " 63%|████████████████████████████████████████████████▎                            | 1396/2227 [00:15<00:06, 133.88it/s]\u001b[A\n",
-      " 63%|█████████████████████████████████████████████████▍                            | 1411/2227 [00:15<00:12, 64.64it/s]\u001b[A\n",
-      " 64%|█████████████████████████████████████████████████▊                            | 1423/2227 [00:15<00:14, 54.47it/s]\u001b[A\n",
-      " 65%|██████████████████████████████████████████████████▌                           | 1442/2227 [00:15<00:11, 69.27it/s]\u001b[A\n",
-      " 66%|███████████████████████████████████████████████████                           | 1459/2227 [00:16<00:09, 82.11it/s]\u001b[A\n",
-      " 66%|███████████████████████████████████████████████████▌                          | 1472/2227 [00:16<00:08, 90.32it/s]\u001b[A\n",
-      " 67%|████████████████████████████████████████████████████                          | 1485/2227 [00:16<00:07, 96.08it/s]\u001b[A\n",
-      " 67%|███████████████████████████████████████████████████▊                         | 1498/2227 [00:16<00:07, 100.63it/s]\u001b[A\n",
-      " 68%|████████████████████████████████████████████████████▎                        | 1513/2227 [00:16<00:06, 111.34it/s]\u001b[A\n",
-      " 69%|█████████████████████████████████████████████████████                        | 1533/2227 [00:16<00:05, 127.75it/s]\u001b[A\n",
-      " 70%|██████████████████████████████████████████████████████▏                       | 1548/2227 [00:16<00:06, 99.87it/s]\u001b[A\n",
-      " 70%|██████████████████████████████████████████████████████▋                       | 1561/2227 [00:17<00:09, 70.40it/s]\u001b[A\n",
-      " 71%|███████████████████████████████████████████████████████                       | 1571/2227 [00:17<00:09, 69.84it/s]\u001b[A\n",
-      " 71%|███████████████████████████████████████████████████████▍                      | 1584/2227 [00:17<00:08, 78.10it/s]\u001b[A\n",
-      " 72%|███████████████████████████████████████████████████████▊                      | 1595/2227 [00:17<00:07, 83.84it/s]\u001b[A\n",
-      " 72%|████████████████████████████████████████████████████████▏                     | 1605/2227 [00:17<00:08, 75.55it/s]\u001b[A\n",
-      " 72%|████████████████████████████████████████████████████████▌                     | 1614/2227 [00:17<00:07, 77.72it/s]\u001b[A\n",
-      " 73%|████████████████████████████████████████████████████████▉                     | 1624/2227 [00:17<00:07, 82.39it/s]\u001b[A\n",
-      " 74%|█████████████████████████████████████████████████████████▋                    | 1647/2227 [00:18<00:05, 99.88it/s]\u001b[A\n",
-      " 75%|█████████████████████████████████████████████████████████▍                   | 1660/2227 [00:18<00:05, 103.38it/s]\u001b[A\n",
-      " 75%|██████████████████████████████████████████████████████████▌                   | 1672/2227 [00:18<00:05, 99.51it/s]\u001b[A\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      " 76%|██████████████████████████████████████████████████████████▏                  | 1684/2227 [00:18<00:05, 102.91it/s]\u001b[A\n",
-      " 76%|██████████████████████████████████████████████████████████▋                  | 1697/2227 [00:18<00:04, 109.63it/s]\u001b[A\n",
-      " 77%|███████████████████████████████████████████████████████████▏                 | 1712/2227 [00:18<00:04, 119.03it/s]\u001b[A\n",
-      " 78%|███████████████████████████████████████████████████████████▉                 | 1733/2227 [00:18<00:03, 136.08it/s]\u001b[A\n",
-      " 79%|████████████████████████████████████████████████████████████▋                | 1757/2227 [00:18<00:03, 155.76it/s]\u001b[A\n",
-      " 80%|█████████████████████████████████████████████████████████████▋               | 1783/2227 [00:18<00:02, 176.00it/s]\u001b[A\n",
-      " 81%|██████████████████████████████████████████████████████████████▎              | 1803/2227 [00:19<00:02, 144.49it/s]\u001b[A\n",
-      " 82%|██████████████████████████████████████████████████████████████▉              | 1820/2227 [00:19<00:03, 134.51it/s]\u001b[A\n",
-      " 83%|███████████████████████████████████████████████████████████████▌             | 1840/2227 [00:19<00:02, 148.72it/s]\u001b[A\n",
-      " 83%|████████████████████████████████████████████████████████████████▎            | 1859/2227 [00:19<00:02, 158.07it/s]\u001b[A\n",
-      " 85%|█████████████████████████████████████████████████████████████████            | 1883/2227 [00:19<00:01, 174.77it/s]\u001b[A\n",
-      " 85%|█████████████████████████████████████████████████████████████████▊           | 1903/2227 [00:19<00:02, 133.57it/s]\u001b[A\n",
-      " 86%|██████████████████████████████████████████████████████████████████▎          | 1919/2227 [00:19<00:02, 128.47it/s]\u001b[A\n",
-      " 87%|██████████████████████████████████████████████████████████████████▊          | 1934/2227 [00:20<00:02, 127.47it/s]\u001b[A\n",
-      " 88%|███████████████████████████████████████████████████████████████████▍         | 1949/2227 [00:20<00:02, 126.35it/s]\u001b[A\n",
-      " 88%|████████████████████████████████████████████████████████████████████         | 1969/2227 [00:20<00:01, 141.48it/s]\u001b[A\n",
-      " 89%|████████████████████████████████████████████████████████████████████▊        | 1992/2227 [00:20<00:01, 159.94it/s]\u001b[A\n",
-      " 90%|█████████████████████████████████████████████████████████████████████▌       | 2013/2227 [00:20<00:01, 171.16it/s]\u001b[A\n",
-      " 92%|██████████████████████████████████████████████████████████████████████▍      | 2039/2227 [00:20<00:00, 190.33it/s]\u001b[A\n",
-      " 93%|███████████████████████████████████████████████████████████████████████▏     | 2060/2227 [00:20<00:00, 194.50it/s]\u001b[A\n",
-      " 93%|███████████████████████████████████████████████████████████████████████▉     | 2081/2227 [00:20<00:00, 187.83it/s]\u001b[A\n",
-      " 94%|████████████████████████████████████████████████████████████████████████▋    | 2101/2227 [00:20<00:00, 177.27it/s]\u001b[A\n",
-      " 95%|█████████████████████████████████████████████████████████████████████████▎   | 2120/2227 [00:20<00:00, 176.31it/s]\u001b[A\n",
-      " 96%|██████████████████████████████████████████████████████████████████████████   | 2142/2227 [00:21<00:00, 185.80it/s]\u001b[A\n",
-      " 97%|██████████████████████████████████████████████████████████████████████████▊  | 2162/2227 [00:21<00:00, 177.11it/s]\u001b[A\n",
-      " 98%|███████████████████████████████████████████████████████████████████████████▌ | 2186/2227 [00:21<00:00, 191.89it/s]\u001b[A\n",
-      "100%|█████████████████████████████████████████████████████████████████████████████| 2227/2227 [00:21<00:00, 103.56it/s]\u001b[A\n"
-     ]
-    }
-   ],
-   "source": [
-    "label_functions_list = [contains_reduction_tokens,\n",
-    "                        contains_reduction_tokens_text_between,\n",
-    "                        contains_negative_corrrelation_regex,\n",
-    "                        contains_increase_decrease_pattern,\n",
-    "                        contains_increase_tokens,\n",
-    "                        contains_increase_tokens_text_between,\n",
-    "                        contains_positive_corrrelation_regex,\n",
-    "                        contains_increase_increase_pattern\n",
-    "                       ]\n",
-    "\n",
-    "applier = PandasLFApplier(label_functions_list)\n",
-    "\n",
-    "label_matrix_train = applier.apply(df_train)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.6 Examining the quality of the labels"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 148,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>j</th>\n",
-       "      <th>Polarity</th>\n",
-       "      <th>Coverage</th>\n",
-       "      <th>Overlaps</th>\n",
-       "      <th>Conflicts</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>contains_reduction_tokens</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[1]</td>\n",
-       "      <td>0.057027</td>\n",
-       "      <td>0.019308</td>\n",
-       "      <td>0.016165</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_reduction_tokens_text_between</td>\n",
-       "      <td>1</td>\n",
-       "      <td>[1]</td>\n",
-       "      <td>0.004041</td>\n",
-       "      <td>0.004041</td>\n",
-       "      <td>0.000898</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_negative_corrrelation_regex</td>\n",
-       "      <td>2</td>\n",
-       "      <td>[1]</td>\n",
-       "      <td>0.008532</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_increase_decrease_pattern</td>\n",
-       "      <td>3</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_increase_tokens</td>\n",
-       "      <td>4</td>\n",
-       "      <td>[0]</td>\n",
-       "      <td>0.130220</td>\n",
-       "      <td>0.016165</td>\n",
-       "      <td>0.016165</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_increase_tokens_text_between</td>\n",
-       "      <td>5</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_positive_corrrelation_regex</td>\n",
-       "      <td>6</td>\n",
-       "      <td>[1]</td>\n",
-       "      <td>0.002245</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_increase_increase_pattern</td>\n",
-       "      <td>7</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                        j Polarity  Coverage  Overlaps  \\\n",
-       "contains_reduction_tokens               0      [1]  0.057027  0.019308   \n",
-       "contains_reduction_tokens_text_between  1      [1]  0.004041  0.004041   \n",
-       "contains_negative_corrrelation_regex    2      [1]  0.008532  0.000000   \n",
-       "contains_increase_decrease_pattern      3       []  0.000000  0.000000   \n",
-       "contains_increase_tokens                4      [0]  0.130220  0.016165   \n",
-       "contains_increase_tokens_text_between   5       []  0.000000  0.000000   \n",
-       "contains_positive_corrrelation_regex    6      [1]  0.002245  0.000000   \n",
-       "contains_increase_increase_pattern      7       []  0.000000  0.000000   \n",
-       "\n",
-       "                                        Conflicts  \n",
-       "contains_reduction_tokens                0.016165  \n",
-       "contains_reduction_tokens_text_between   0.000898  \n",
-       "contains_negative_corrrelation_regex     0.000000  \n",
-       "contains_increase_decrease_pattern       0.000000  \n",
-       "contains_increase_tokens                 0.016165  \n",
-       "contains_increase_tokens_text_between    0.000000  \n",
-       "contains_positive_corrrelation_regex     0.000000  \n",
-       "contains_increase_increase_pattern       0.000000  "
-      ]
-     },
-     "execution_count": 148,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#in the absence of a benchmark to compare against\n",
-    "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 149,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>j</th>\n",
-       "      <th>Polarity</th>\n",
-       "      <th>Coverage</th>\n",
-       "      <th>Overlaps</th>\n",
-       "      <th>Conflicts</th>\n",
-       "      <th>Correct</th>\n",
-       "      <th>Incorrect</th>\n",
-       "      <th>Emp. Acc.</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>contains_reduction_tokens</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[1]</td>\n",
-       "      <td>0.057027</td>\n",
-       "      <td>0.019308</td>\n",
-       "      <td>0.016165</td>\n",
-       "      <td>69</td>\n",
-       "      <td>58</td>\n",
-       "      <td>0.543307</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_reduction_tokens_text_between</td>\n",
-       "      <td>1</td>\n",
-       "      <td>[1]</td>\n",
-       "      <td>0.004041</td>\n",
-       "      <td>0.004041</td>\n",
-       "      <td>0.000898</td>\n",
-       "      <td>8</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.888889</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_negative_corrrelation_regex</td>\n",
-       "      <td>2</td>\n",
-       "      <td>[1]</td>\n",
-       "      <td>0.008532</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>18</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.947368</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_increase_decrease_pattern</td>\n",
-       "      <td>3</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_increase_tokens</td>\n",
-       "      <td>4</td>\n",
-       "      <td>[0]</td>\n",
-       "      <td>0.130220</td>\n",
-       "      <td>0.016165</td>\n",
-       "      <td>0.016165</td>\n",
-       "      <td>263</td>\n",
-       "      <td>27</td>\n",
-       "      <td>0.906897</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_increase_tokens_text_between</td>\n",
-       "      <td>5</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_positive_corrrelation_regex</td>\n",
-       "      <td>6</td>\n",
-       "      <td>[1]</td>\n",
-       "      <td>0.002245</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0</td>\n",
-       "      <td>5</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>contains_increase_increase_pattern</td>\n",
-       "      <td>7</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                        j Polarity  Coverage  Overlaps  \\\n",
-       "contains_reduction_tokens               0      [1]  0.057027  0.019308   \n",
-       "contains_reduction_tokens_text_between  1      [1]  0.004041  0.004041   \n",
-       "contains_negative_corrrelation_regex    2      [1]  0.008532  0.000000   \n",
-       "contains_increase_decrease_pattern      3       []  0.000000  0.000000   \n",
-       "contains_increase_tokens                4      [0]  0.130220  0.016165   \n",
-       "contains_increase_tokens_text_between   5       []  0.000000  0.000000   \n",
-       "contains_positive_corrrelation_regex    6      [1]  0.002245  0.000000   \n",
-       "contains_increase_increase_pattern      7       []  0.000000  0.000000   \n",
-       "\n",
-       "                                        Conflicts  Correct  Incorrect  \\\n",
-       "contains_reduction_tokens                0.016165       69         58   \n",
-       "contains_reduction_tokens_text_between   0.000898        8          1   \n",
-       "contains_negative_corrrelation_regex     0.000000       18          1   \n",
-       "contains_increase_decrease_pattern       0.000000        0          0   \n",
-       "contains_increase_tokens                 0.016165      263         27   \n",
-       "contains_increase_tokens_text_between    0.000000        0          0   \n",
-       "contains_positive_corrrelation_regex     0.000000        0          5   \n",
-       "contains_increase_increase_pattern       0.000000        0          0   \n",
-       "\n",
-       "                                        Emp. Acc.  \n",
-       "contains_reduction_tokens                0.543307  \n",
-       "contains_reduction_tokens_text_between   0.888889  \n",
-       "contains_negative_corrrelation_regex     0.947368  \n",
-       "contains_increase_decrease_pattern       0.000000  \n",
-       "contains_increase_tokens                 0.906897  \n",
-       "contains_increase_tokens_text_between    0.000000  \n",
-       "contains_positive_corrrelation_regex     0.000000  \n",
-       "contains_increase_increase_pattern       0.000000  "
-      ]
-     },
-     "execution_count": 149,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#examining the quality of the labels in the presence of a benchmark to compare against\n",
-    "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary(y_train.values)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 150,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <td>0</td>\n",
-       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>2</td>\n",
-       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>3</td>\n",
-       "      <td>The administration of methylprednisolone appea...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>7</td>\n",
-       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>12</td>\n",
-       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1644</td>\n",
-       "      <td>Actual bicarbonate and total carbon dioxide co...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1655</td>\n",
-       "      <td>Albumin concentrations were significantly lowe...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1657</td>\n",
-       "      <td>Moreover, the frequencies of regulatory T cell...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1658</td>\n",
-       "      <td>The reduced expressions of interferon-γ (IFN-γ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <td>1668</td>\n",
-       "      <td>Spleen atrophy was observed in all reported ca...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>127 rows × 1 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                   text\n",
-       "0     While blocking TPC2 activity by tetrandrine, a...\n",
-       "2     Thyroid stimulating hormone and free triiodoth...\n",
-       "3     The administration of methylprednisolone appea...\n",
-       "7     Consistent with previous reports, 20mM NH4Cl a...\n",
-       "12    Consistent with previous reports, 20mM NH4Cl a...\n",
-       "...                                                 ...\n",
-       "1644  Actual bicarbonate and total carbon dioxide co...\n",
-       "1655  Albumin concentrations were significantly lowe...\n",
-       "1657  Moreover, the frequencies of regulatory T cell...\n",
-       "1658  The reduced expressions of interferon-γ (IFN-γ...\n",
-       "1668  Spleen atrophy was observed in all reported ca...\n",
-       "\n",
-       "[127 rows x 1 columns]"
-      ]
-     },
-     "execution_count": 150,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#examine which sentences were picked up as showing negative correlation by each label function\n",
-    "df_train.iloc[label_matrix_train[:, 0] == FOUND]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.7 Predict the final label\n",
-    "\n",
-    "Different models can be used to create the final model that aggrateges the different label functions to perdict the final lebel."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 151,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
-      "  from pandas import Panel\n",
-      "\n",
-      "  0%|                                                                                          | 0/557 [00:00<?, ?it/s]\u001b[A\n",
-      "  5%|███▉                                                                            | 27/557 [00:00<00:01, 266.68it/s]\u001b[A\n",
-      "  7%|█████▌                                                                          | 39/557 [00:00<00:02, 192.85it/s]\u001b[A\n",
-      "  9%|███████▍                                                                        | 52/557 [00:00<00:03, 166.70it/s]\u001b[A\n",
-      " 12%|█████████▍                                                                      | 66/557 [00:00<00:03, 156.09it/s]\u001b[A\n",
-      " 14%|███████████▎                                                                    | 79/557 [00:00<00:03, 138.04it/s]\u001b[A\n",
-      " 16%|█████████████                                                                   | 91/557 [00:00<00:03, 130.99it/s]\u001b[A\n",
-      " 19%|███████████████▏                                                               | 107/557 [00:00<00:03, 137.18it/s]\u001b[A\n",
-      " 22%|█████████████████▌                                                             | 124/557 [00:00<00:03, 144.01it/s]\u001b[A\n",
-      " 26%|████████████████████▎                                                          | 143/557 [00:00<00:02, 154.01it/s]\u001b[A\n",
-      " 29%|██████████████████████▌                                                        | 159/557 [00:01<00:02, 139.24it/s]\u001b[A\n",
-      " 31%|████████████████████████▌                                                      | 173/557 [00:01<00:02, 135.02it/s]\u001b[A\n",
-      " 34%|██████████████████████████▌                                                    | 187/557 [00:01<00:02, 130.39it/s]\u001b[A\n",
-      " 36%|████████████████████████████▌                                                  | 201/557 [00:01<00:02, 129.75it/s]\u001b[A\n",
-      " 39%|███████████████████████████████▏                                               | 220/557 [00:01<00:02, 142.39it/s]\u001b[A\n",
-      " 43%|██████████████████████████████████▏                                            | 241/557 [00:01<00:02, 152.56it/s]\u001b[A\n",
-      " 46%|████████████████████████████████████▍                                          | 257/557 [00:01<00:02, 145.73it/s]\u001b[A\n",
-      " 49%|██████████████████████████████████████▌                                        | 272/557 [00:01<00:02, 133.97it/s]\u001b[A\n",
-      " 51%|████████████████████████████████████████▌                                      | 286/557 [00:02<00:02, 126.53it/s]\u001b[A\n",
-      " 54%|██████████████████████████████████████████▋                                    | 301/557 [00:02<00:01, 132.39it/s]\u001b[A\n",
-      " 57%|█████████████████████████████████████████████▏                                 | 319/557 [00:02<00:01, 142.70it/s]\u001b[A\n",
-      " 61%|████████████████████████████████████████████████                               | 339/557 [00:02<00:01, 155.15it/s]\u001b[A\n",
-      " 64%|██████████████████████████████████████████████████▍                            | 356/557 [00:02<00:01, 153.73it/s]\u001b[A\n",
-      " 67%|████████████████████████████████████████████████████▊                          | 372/557 [00:02<00:01, 155.09it/s]\u001b[A\n",
-      " 70%|███████████████████████████████████████████████████████▏                       | 389/557 [00:02<00:01, 158.37it/s]\u001b[A\n",
-      " 73%|█████████████████████████████████████████████████████████▌                     | 406/557 [00:02<00:00, 159.67it/s]\u001b[A\n",
-      " 76%|███████████████████████████████████████████████████████████▉                   | 423/557 [00:02<00:00, 158.73it/s]\u001b[A\n",
-      " 79%|██████████████████████████████████████████████████████████████▍                | 440/557 [00:02<00:00, 159.66it/s]\u001b[A\n",
-      " 82%|████████████████████████████████████████████████████████████████▊              | 457/557 [00:03<00:00, 155.04it/s]\u001b[A\n",
-      " 87%|████████████████████████████████████████████████████████████████████▎          | 482/557 [00:03<00:00, 174.46it/s]\u001b[A\n",
-      " 90%|███████████████████████████████████████████████████████████████████████        | 501/557 [00:03<00:00, 169.50it/s]\u001b[A\n",
-      " 93%|█████████████████████████████████████████████████████████████████████████▌     | 519/557 [00:03<00:00, 151.16it/s]\u001b[A\n",
-      "100%|███████████████████████████████████████████████████████████████████████████████| 557/557 [00:03<00:00, 152.95it/s]\u001b[A\n"
-     ]
-    }
-   ],
-   "source": [
-    "#testing data\n",
-    "label_matrix_test = applier.apply(df_test)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 152,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Model 1 : majority model (mm)\n",
-    "majority_model = MajorityLabelVoter()\n",
-    "\n",
-    "#training data\n",
-    "mm_preds_class_train = majority_model.predict(L=label_matrix_train)\n",
-    "mm_preds_proba_train = majority_model.predict_proba(L=label_matrix_train)\n",
-    "\n",
-    "#testing data\n",
-    "mm_preds_class_test = majority_model.predict(L=label_matrix_test)\n",
-    "mm_preds_proba_test = majority_model.predict_proba(L=label_matrix_test)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 153,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([ 1, -1,  1, ..., -1, -1, -1])"
-      ]
-     },
-     "execution_count": 153,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "mm_preds_class_train # only the 1s and 0s are labels. T-1s are abstains i.e. unlabeled data points"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 154,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[0. , 1. ],\n",
-       "       [0.5, 0.5],\n",
-       "       [0. , 1. ],\n",
-       "       ...,\n",
-       "       [0.5, 0.5],\n",
-       "       [0.5, 0.5],\n",
-       "       [0.5, 0.5]])"
-      ]
-     },
-     "execution_count": 154,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "mm_preds_proba_train"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 155,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Model 2:label model (lm)\n",
-    "\n",
-    "#call the model\n",
-    "label_model = LabelModel(cardinality=2, verbose=True)\n",
-    "\n",
-    "#fit the model\n",
-    "num_epochs = 1000\n",
-    "log_frequency = 100\n",
-    "random_seed = 1\n",
-    "label_model.fit(L_train=label_matrix_train, n_epochs=num_epochs, log_freq=log_frequency, seed=random_seed)\n",
-    "\n",
-    "#generate lables for training data\n",
-    "lm_preds_proba_train = label_model.predict_proba(label_matrix_train)\n",
-    "lm_preds_class_train = probs_to_preds(lm_preds_proba_train)\n",
-    "\n",
-    "#generate labels for testing data\n",
-    "lm_preds_proba_test = label_model.predict_proba(label_matrix_test)\n",
-    "lm_preds_class_test = probs_to_preds(lm_preds_proba_test)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 156,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Model 3 : Random Voter (rv)\n",
-    "\n",
-    "random_voter = RandomVoter()\n",
-    "\n",
-    "#training data\n",
-    "rv_preds_class_train = random_voter.predict(L=label_matrix_train)\n",
-    "rv_preds_proba_train = random_voter.predict_proba(L=label_matrix_train)\n",
-    "\n",
-    "#testing data\n",
-    "rv_preds_class_test = random_voter.predict(L=label_matrix_test)\n",
-    "rv_preds_proba_test = random_voter.predict_proba(L=label_matrix_test)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2.8 Comparing different models"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 157,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Majority Model \n",
-      " Accuracy: \n",
-      " train-> 0.5635383924562192 \n",
-      " test-> 0.4703770197486535 \n",
-      " AUC: \n",
-      " train-> 0.6010212548732079 \n",
-      " test-> 0.5227059436913452 \n",
-      "\n",
-      "Label Model \n",
-      " Accuracy: \n",
-      " train-> 0.5527615626403233 \n",
-      " test-> 0.49012567324955114 \n",
-      " AUC: \n",
-      " train-> 0.524345344386498 \n",
-      " test-> 0.4437434827945777 \n",
-      "\n",
-      "Random Voter Model \n",
-      " Accuracy: \n",
-      " train-> 0.5024696901661428 \n",
-      " test-> 0.5008976660682226 \n",
-      " AUC: \n",
-      " train-> 0.5126309212678816 \n",
-      " test-> 0.5023114355231144 \n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "all_models = {'Majority Model':majority_model,\n",
-    "              'Label Model':label_model,\n",
-    "              'Random Voter Model':random_voter}\n",
-    "\n",
-    "for model_name,model in all_models.items():\n",
-    "    \n",
-    "    #accuracy\n",
-    "    train_acc = model.score(L=label_matrix_train, Y=y_train, tie_break_policy=\"random\")[\"accuracy\"]\n",
-    "    test_acc = model.score(L=label_matrix_test, Y=y_test, tie_break_policy=\"random\")[\"accuracy\"]\n",
-    "    \n",
-    "    #auc\n",
-    "    train_auc = metric_score(y_train, probs=model.predict_proba(L=label_matrix_train), metric='roc_auc')\n",
-    "    test_auc = metric_score(y_test, probs=model.predict_proba(L=label_matrix_test), metric='roc_auc')\n",
-    "    \n",
-    "    print(f'{model_name}','\\n',\n",
-    "          'Accuracy:','\\n','train->',train_acc,'\\n','test->',test_acc,'\\n',\n",
-    "          'AUC:','\\n','train->',train_auc,'\\n','test->',test_auc,'\\n')\n",
-    "    \n",
-    "    \n",
-    "    "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The label model has the highest test AUC so that's the  best model."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 3. Filter out unlabeled points"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 169,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Total points labelled in training data: 405\n",
-      "Total points labelled in testing data: 91\n"
-     ]
-    }
-   ],
-   "source": [
-    "#training labels\n",
-    "df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(\n",
-    "    X=df_train['text'], \n",
-    "    y=lm_preds_proba_train, \n",
-    "    L=label_matrix_train\n",
-    ")\n",
-    "\n",
-    "#testing labels\n",
-    "df_test_filtered, probs_test_filtered = filter_unlabeled_dataframe(\n",
-    "    X=df_test['text'], \n",
-    "    y=lm_preds_proba_test, \n",
-    "    L=label_matrix_test\n",
-    ")\n",
-    "\n",
-    "print('Total points labelled in training data:',len(df_train_filtered))\n",
-    "print('Total points labelled in testing data:',len(df_test_filtered))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# References\n",
-    "\n",
-    "https://www.snorkel.org/use-cases/spouse-demo\n",
-    "    \n",
-    "https://github.com/snorkel-team/snorkel-tutorials/blob/master/spouse/spouse_demo.ipynb\n",
-    "    \n",
-    "https://www.snorkel.org/use-cases/01-spam-tutorial\n",
-    "    \n",
-    "https://readthedocs.org/projects/snorkel/downloads/pdf/master/"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.4"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

From 5d3751e9355b1181b5dd98315a9e4ac0a2d915a6 Mon Sep 17 00:00:00 2001
From: kritim13 <kriti.mahajan.13@gmail.com>
Date: Wed, 1 Jul 2020 22:42:35 +0530
Subject: [PATCH 3/3] 'snorkel'

---
 .../Snorkel RE example.ipynb                  | 1952 +++++++++++++++++
 .../snorkel_preprocessing_example.py          |   69 +
 2 files changed, 2021 insertions(+)
 create mode 100644 immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb
 create mode 100644 immunology_kg/notebooks/snorkel_re_example/snorkel_preprocessing_example.py

diff --git a/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb b/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb
new file mode 100644
index 0000000..86b90af
--- /dev/null
+++ b/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb	
@@ -0,0 +1,1952 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from snorkel.preprocess.nlp import SpacyPreprocessor\n",
+    "from snorkel.labeling import PandasLFApplier,filter_unlabeled_dataframe,LFAnalysis ,labeling_function\n",
+    "from snorkel.labeling.model import MajorityClassVoter,MajorityLabelVoter,RandomVoter ,LabelModel\n",
+    "\n",
+    "\n",
+    "from snorkel.analysis import metric_score , get_label_buckets\n",
+    "\n",
+    "from snorkel.utils import probs_to_preds\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import os\n",
+    "from collections import OrderedDict\n",
+    "\n",
+    "#importing self-defined helper modules\n",
+    "from snorkel_preprocessing_example import make_source_target_preprocessor,make_text_between_preprocessor"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 1. Load the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>relation</th>\n",
+       "      <th>target</th>\n",
+       "      <th>link</th>\n",
+       "      <th>pmc_id</th>\n",
+       "      <th>doi_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
+       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
+       "      <td>negativeCorrelation</td>\n",
+       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
+       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
+       "      <td>negativeCorrelation</td>\n",
+       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
+       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
+       "      <td>negativeCorrelation</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
+       "      <td>32217556.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Based on these results, we performed virtual d...</td>\n",
+       "      <td>{\"4'-epidoxorubicin\": {'namespace': 'chebi', '...</td>\n",
+       "      <td>decreases</td>\n",
+       "      <td>{'3.4.22.69': {'namespace': 'eccode', 'name': ...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32173287.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>4</td>\n",
+       "      <td>Doctors can also use a clinically approved bil...</td>\n",
+       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
+       "      <td>decreases</td>\n",
+       "      <td>{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D008168': True}}, '...</td>\n",
+       "      <td>32205856.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>5</td>\n",
+       "      <td>Since Vitamin B3 is highly lung protective, it...</td>\n",
+       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
+       "      <td>decreases</td>\n",
+       "      <td>{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32205856.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "      <td>Doctors can also use a clinically approved bil...</td>\n",
+       "      <td>{'4-methylumbelliferone': {'namespace': 'chebi...</td>\n",
+       "      <td>decreases</td>\n",
+       "      <td>{'inflammatory response': {'namespace': 'go', ...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D008168': True}}, '...</td>\n",
+       "      <td>32205856.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Unnamed: 0                                               text  \\\n",
+       "0           0  While blocking TPC2 activity by tetrandrine, a...   \n",
+       "1           1  Chemoinformatics searches yielded 15 approved ...   \n",
+       "2           2  Thyroid stimulating hormone and free triiodoth...   \n",
+       "3           3  Based on these results, we performed virtual d...   \n",
+       "4           4  Doctors can also use a clinically approved bil...   \n",
+       "5           5  Since Vitamin B3 is highly lung protective, it...   \n",
+       "6           6  Doctors can also use a clinically approved bil...   \n",
+       "\n",
+       "                                              source             relation  \\\n",
+       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...  negativeCorrelation   \n",
+       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...  negativeCorrelation   \n",
+       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...  negativeCorrelation   \n",
+       "3  {\"4'-epidoxorubicin\": {'namespace': 'chebi', '...            decreases   \n",
+       "4  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
+       "5  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
+       "6  {'4-methylumbelliferone': {'namespace': 'chebi...            decreases   \n",
+       "\n",
+       "                                              target  \\\n",
+       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
+       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
+       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "3  {'3.4.22.69': {'namespace': 'eccode', 'name': ...   \n",
+       "4  {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...   \n",
+       "5  {'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...   \n",
+       "6  {'inflammatory response': {'namespace': 'go', ...   \n",
+       "\n",
+       "                                                link      pmc_id  \\\n",
+       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
+       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
+       "3  {'annotations': {}, 'citation': {'authors': ['...  32173287.0   \n",
+       "4  {'annotations': {'mesh': {'D008168': True}}, '...  32205856.0   \n",
+       "5  {'annotations': {}, 'citation': {'authors': ['...  32205856.0   \n",
+       "6  {'annotations': {'mesh': {'D008168': True}}, '...  32205856.0   \n",
+       "\n",
+       "                                      doi_id  \n",
+       "0                                        NaN  \n",
+       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
+       "2                                        NaN  \n",
+       "3                                        NaN  \n",
+       "4                                        NaN  \n",
+       "5                                        NaN  \n",
+       "6                                        NaN  "
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#'https://raw.githubusercontent.com/covid19kg/covid19kg/master/supplement/terminology.csv'\n",
+    "url = 'https://raw.githubusercontent.com/CoronaWhy/task-vt/kaleidoescape_kg/immunology_kg/relations/covid19_frauenhofer_annotations.csv'\n",
+    "pybel_pd = pd.read_csv(url)\n",
+    "pybel_pd.head(7)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['negativeCorrelation', 'decreases', 'regulates', 'increases',\n",
+       "       'positiveCorrelation', 'association', 'isA', 'biomarkerFor',\n",
+       "       'prognosticBiomarkerFor', 'causesNoChange'], dtype=object)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#list all types of relations\n",
+    "relation_categories = pybel_pd['relation'].unique()\n",
+    "relation_categories"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2. Snorkel Example\n",
+    "\n",
+    "For the purpose of this example we'll only focus on rows with 'negativeCorrelation' and 'positiveCorrelation' as their relations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  \n",
+      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:4102: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame\n",
+      "\n",
+      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  errors=errors,\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>relation</th>\n",
+       "      <th>target</th>\n",
+       "      <th>link</th>\n",
+       "      <th>pmc_id</th>\n",
+       "      <th>doi_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
+       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
+       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
+       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
+       "      <td>32217556.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>The administration of methylprednisolone appea...</td>\n",
+       "      <td>{'6-methylprednisolone': {'namespace': 'chebi'...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'Death': {'namespace': 'mesh', 'name': 'Death...</td>\n",
+       "      <td>{'annotations': {'doid': {'11394': True}}, 'ci...</td>\n",
+       "      <td>32167524.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
+       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Low-grade fever': {'namespace': 'hp', 'name'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32166483.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text  \\\n",
+       "0  While blocking TPC2 activity by tetrandrine, a...   \n",
+       "1  Chemoinformatics searches yielded 15 approved ...   \n",
+       "2  Thyroid stimulating hormone and free triiodoth...   \n",
+       "3  The administration of methylprednisolone appea...   \n",
+       "4  Adverse reactions of IFN-α mainly include low-...   \n",
+       "\n",
+       "                                              source  relation  \\\n",
+       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...      True   \n",
+       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...      True   \n",
+       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...      True   \n",
+       "3  {'6-methylprednisolone': {'namespace': 'chebi'...      True   \n",
+       "4  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
+       "\n",
+       "                                              target  \\\n",
+       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
+       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
+       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "3  {'Death': {'namespace': 'mesh', 'name': 'Death...   \n",
+       "4  {'Low-grade fever': {'namespace': 'hp', 'name'...   \n",
+       "\n",
+       "                                                link      pmc_id  \\\n",
+       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
+       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
+       "3  {'annotations': {'doid': {'11394': True}}, 'ci...  32167524.0   \n",
+       "4  {'annotations': {}, 'citation': {'authors': ['...  32166483.0   \n",
+       "\n",
+       "                                      doi_id  \n",
+       "0                                        NaN  \n",
+       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
+       "2                                        NaN  \n",
+       "3                                        NaN  \n",
+       "4                                        NaN  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Keep only the two correlation relations and recode `relation` as a boolean:\n",
+    "# True -> 'negativeCorrelation', False -> 'positiveCorrelation'.\n",
+    "# Everything is done in one non-mutating chain so that no assignment or inplace\n",
+    "# operation touches a filtered slice of `pybel_pd` (the cause of SettingWithCopyWarning).\n",
+    "example_data = (\n",
+    "    pybel_pd[pybel_pd['relation'].isin(['negativeCorrelation', 'positiveCorrelation'])]\n",
+    "    .assign(relation=lambda df: df['relation'] == 'negativeCorrelation')\n",
+    "    .reset_index(drop=True)\n",
+    "    .drop('Unnamed: 0', axis=1)  # presumably a leftover saved-index column; verify against the loader\n",
+    ")\n",
+    "example_data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.1 Split the data into training and testing \n",
+    "\n",
+    "Ideally we should have training, validation and testing sets. Also, here I'm using a fixed testing period, but k-fold cross-validation techniques are a more robust way of determining the accuracy of the generated labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ordered hold-out split: rows keep their original order (shuffle=False) and\n",
+    "# 20% of them go to the test set. Features are the sentence text, labels the boolean relation.\n",
+    "df_train, df_test, y_train, y_test = train_test_split(\n",
+    "    example_data[['text']],\n",
+    "    example_data[['relation']],\n",
+    "    test_size=0.20,\n",
+    "    shuffle=False,\n",
+    "    random_state=1,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.2 Reading sentences to understand syntactic differences between negative and positive correlation sentences\n",
+    "\n",
+    "The utility of snorkel is that it allows you to create multiple labelling functions which (try to) mimic the rules that a human annotator of data would apply while deciding how to label unlabelled data. For instance, a human annotator looking to identify negative correlation in sentences will follow these rules:\n",
+    "\n",
+    "##### A. Positive rules: rules which tell you what a sentence with negative correlation looks like\n",
+    "\n",
+    "1) does the sentence contain reduction-related words like 'decreased','reduced','lowered'\n",
+    "\n",
+    "2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'\n",
+    "\n",
+    "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)\n",
+    "\n",
+    "4) does the sentence contain the expression 'negative effect'\n",
+    "\n",
+    "5) does the sentence contain the expression 'move in opposite directions'\n",
+    "\n",
+    "##### B. Negative rules: rules which tell you what a sentence without negative correlation looks like\n",
+    "\n",
+    "1) does the sentence contain increase-related words like 'increased','improved'\n",
+    "\n",
+    "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related'\n",
+    "\n",
+    "3) does the syntax follow the structure 'increase in x is associated with an increase in y' (and vice versa)\n",
+    "\n",
+    "4) does the sentence contain the expression 'positive effect'\n",
+    "\n",
+    "5) does the sentence contain the expression 'move in the same direction'\n",
+    "\n",
+    "These rules can be coded using snorkel. Importantly it requires both positive and negative rules. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.2.1 Examining Negative correlation sentences to understand their syntactic structure and then define labelling functions accordingly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>relation</th>\n",
+       "      <th>target</th>\n",
+       "      <th>link</th>\n",
+       "      <th>pmc_id</th>\n",
+       "      <th>doi_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
+       "      <td>{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>Chemoinformatics searches yielded 15 approved ...</td>\n",
+       "      <td>{'(S)-verapamil': {'namespace': 'chebi', 'name...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'hypertension': {'namespace': 'doid', 'name':...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'db': 'DOI', ...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>https://doi.org/10.1101/2020.03.22.002386</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
+       "      <td>{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D044967': True}}, '...</td>\n",
+       "      <td>32217556.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>The administration of methylprednisolone appea...</td>\n",
+       "      <td>{'6-methylprednisolone': {'namespace': 'chebi'...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'Death': {'namespace': 'mesh', 'name': 'Death...</td>\n",
+       "      <td>{'annotations': {'doid': {'11394': True}}, 'ci...</td>\n",
+       "      <td>32167524.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>In our opinion, during the COVID-19 pandemic, ...</td>\n",
+       "      <td>{'adrenergic antagonist': {'namespace': 'chebi...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32220710.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
+       "      <td>{'ammonium chloride': {'namespace': 'chebi', '...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'G protein, vesicular stomatitis virus': {'na...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>If the latter percentage would be found to be ...</td>\n",
+       "      <td>{'angiotensin receptor antagonist': {'namespac...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32129518.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
+       "      <td>{'bafilomycin A1': {'namespace': 'chebi', 'nam...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>{'G protein, vesicular stomatitis virus': {'na...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221306.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text  \\\n",
+       "0  While blocking TPC2 activity by tetrandrine, a...   \n",
+       "1  Chemoinformatics searches yielded 15 approved ...   \n",
+       "2  Thyroid stimulating hormone and free triiodoth...   \n",
+       "3  The administration of methylprednisolone appea...   \n",
+       "4  In our opinion, during the COVID-19 pandemic, ...   \n",
+       "5  Consistent with previous reports, 20mM NH4Cl a...   \n",
+       "6  If the latter percentage would be found to be ...   \n",
+       "7  Consistent with previous reports, 20mM NH4Cl a...   \n",
+       "\n",
+       "                                              source  relation  \\\n",
+       "0  {'(+)-Tetrandrine': {'namespace': 'chebi', 'na...      True   \n",
+       "1  {'(S)-verapamil': {'namespace': 'chebi', 'name...      True   \n",
+       "2  {\"3,3',5'-triiodothyronine\": {'namespace': 'ch...      True   \n",
+       "3  {'6-methylprednisolone': {'namespace': 'chebi'...      True   \n",
+       "4  {'adrenergic antagonist': {'namespace': 'chebi...      True   \n",
+       "5  {'ammonium chloride': {'namespace': 'chebi', '...      True   \n",
+       "6  {'angiotensin receptor antagonist': {'namespac...      True   \n",
+       "7  {'bafilomycin A1': {'namespace': 'chebi', 'nam...      True   \n",
+       "\n",
+       "                                              target  \\\n",
+       "0  {'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...   \n",
+       "1  {'hypertension': {'namespace': 'doid', 'name':...   \n",
+       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "3  {'Death': {'namespace': 'mesh', 'name': 'Death...   \n",
+       "4  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "5  {'G protein, vesicular stomatitis virus': {'na...   \n",
+       "6  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "7  {'G protein, vesicular stomatitis virus': {'na...   \n",
+       "\n",
+       "                                                link      pmc_id  \\\n",
+       "0  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "1  {'annotations': {}, 'citation': {'db': 'DOI', ...         NaN   \n",
+       "2  {'annotations': {'mesh': {'D044967': True}}, '...  32217556.0   \n",
+       "3  {'annotations': {'doid': {'11394': True}}, 'ci...  32167524.0   \n",
+       "4  {'annotations': {}, 'citation': {'authors': ['...  32220710.0   \n",
+       "5  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "6  {'annotations': {}, 'citation': {'authors': ['...  32129518.0   \n",
+       "7  {'annotations': {}, 'citation': {'authors': ['...  32221306.0   \n",
+       "\n",
+       "                                      doi_id  \n",
+       "0                                        NaN  \n",
+       "1  https://doi.org/10.1101/2020.03.22.002386  \n",
+       "2                                        NaN  \n",
+       "3                                        NaN  \n",
+       "4                                        NaN  \n",
+       "5                                        NaN  \n",
+       "6                                        NaN  \n",
+       "7                                        NaN  "
+      ]
+     },
+     "execution_count": 51,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Rows flagged as negative correlation (`relation` is True), renumbered from 0.\n",
+    "# reset_index is chained (not inplace) so a fresh frame is returned instead of\n",
+    "# mutating a filtered slice of `example_data`.\n",
+    "neg_correl_df = example_data[example_data['relation']==1].reset_index(drop=True)\n",
+    "neg_correl_df.head(8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry.'"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Full text of the first negative-correlation sentence\n",
+    "neg_correl_df.loc[0, 'text']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.2.2 Examining positive correlation sentences to understand their syntactic structure and then define labelling functions accordingly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>relation</th>\n",
+       "      <th>target</th>\n",
+       "      <th>link</th>\n",
+       "      <th>pmc_id</th>\n",
+       "      <th>doi_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
+       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Low-grade fever': {'namespace': 'hp', 'name'...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32166483.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>Adverse reactions of IFN-α mainly include low-...</td>\n",
+       "      <td>{'Interferon alfa-2a': {'namespace': 'chebi', ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'influenza': {'namespace': 'doid', 'name': 'i...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32166483.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>This may be accounted for by two complementary...</td>\n",
+       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'COVID-19': {'namespace': 'doid', 'name': 'CO...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32129518.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>ACE2 can also antagonize cardiac fibrosis and ...</td>\n",
+       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Ventricular Remodeling': {'namespace': 'mesh...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221983.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>ACE2 can also antagonize cardiac fibrosis and ...</td>\n",
+       "      <td>{'angiotensin II': {'namespace': 'chebi', 'nam...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Myocardial fibrosis': {'namespace': 'hp', 'n...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32221983.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>The existence of significantly increased fibri...</td>\n",
+       "      <td>{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Hyperfibrinolysis': {'namespace': 'hp', 'nam...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32216698.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>This opinion is supported by the presence of h...</td>\n",
+       "      <td>{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'Hemorrhage': {'namespace': 'mesh', 'name': '...</td>\n",
+       "      <td>{'annotations': {}, 'citation': {'authors': ['...</td>\n",
+       "      <td>32216698.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>In the influenza virus model, it was reported ...</td>\n",
+       "      <td>{'chloroquine': {'namespace': 'chebi', 'name':...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>{'dendritic cell antigen processing and presen...</td>\n",
+       "      <td>{'annotations': {'mesh': {'D007251': True}}, '...</td>\n",
+       "      <td>32171740.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text  \\\n",
+       "0  Adverse reactions of IFN-α mainly include low-...   \n",
+       "1  Adverse reactions of IFN-α mainly include low-...   \n",
+       "2  This may be accounted for by two complementary...   \n",
+       "3  ACE2 can also antagonize cardiac fibrosis and ...   \n",
+       "4  ACE2 can also antagonize cardiac fibrosis and ...   \n",
+       "5  The existence of significantly increased fibri...   \n",
+       "6  This opinion is supported by the presence of h...   \n",
+       "7  In the influenza virus model, it was reported ...   \n",
+       "\n",
+       "                                              source  relation  \\\n",
+       "0  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
+       "1  {'Interferon alfa-2a': {'namespace': 'chebi', ...     False   \n",
+       "2  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
+       "3  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
+       "4  {'angiotensin II': {'namespace': 'chebi', 'nam...     False   \n",
+       "5  {'Fibrin': {'namespace': 'chebi', 'name': 'Fib...     False   \n",
+       "6  {'Fibrin': {'namespace': 'chebi', 'name': 'Fib...     False   \n",
+       "7  {'chloroquine': {'namespace': 'chebi', 'name':...     False   \n",
+       "\n",
+       "                                              target  \\\n",
+       "0  {'Low-grade fever': {'namespace': 'hp', 'name'...   \n",
+       "1  {'influenza': {'namespace': 'doid', 'name': 'i...   \n",
+       "2  {'COVID-19': {'namespace': 'doid', 'name': 'CO...   \n",
+       "3  {'Ventricular Remodeling': {'namespace': 'mesh...   \n",
+       "4  {'Myocardial fibrosis': {'namespace': 'hp', 'n...   \n",
+       "5  {'Hyperfibrinolysis': {'namespace': 'hp', 'nam...   \n",
+       "6  {'Hemorrhage': {'namespace': 'mesh', 'name': '...   \n",
+       "7  {'dendritic cell antigen processing and presen...   \n",
+       "\n",
+       "                                                link      pmc_id doi_id  \n",
+       "0  {'annotations': {}, 'citation': {'authors': ['...  32166483.0    NaN  \n",
+       "1  {'annotations': {}, 'citation': {'authors': ['...  32166483.0    NaN  \n",
+       "2  {'annotations': {}, 'citation': {'authors': ['...  32129518.0    NaN  \n",
+       "3  {'annotations': {}, 'citation': {'authors': ['...  32221983.0    NaN  \n",
+       "4  {'annotations': {}, 'citation': {'authors': ['...  32221983.0    NaN  \n",
+       "5  {'annotations': {}, 'citation': {'authors': ['...  32216698.0    NaN  \n",
+       "6  {'annotations': {}, 'citation': {'authors': ['...  32216698.0    NaN  \n",
+       "7  {'annotations': {'mesh': {'D007251': True}}, '...  32171740.0    NaN  "
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Rows flagged as positive correlation (`relation` is False), renumbered from 0.\n",
+    "# reset_index is chained (not inplace) so a fresh frame is returned instead of\n",
+    "# mutating a filtered slice of `example_data`.\n",
+    "positive_relation_df = example_data[example_data['relation']==0].reset_index(drop=True)\n",
+    "positive_relation_df.head(8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Adverse reactions of IFN-α mainly include low-grade fever and flu-like symptoms (both in children with intramuscularly injection) [11].'"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Full text of the first positive-correlation sentence\n",
+    "positive_relation_df.loc[0, 'text']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3 Source-Target dictionary"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A simple but clean rule for identifying negative correlation sentences would be to check whether negative tokens occur in the words between the source and the target. So, a source-target dictionary is created for some of the examples (in the final pipeline the source-target dictionary will be obtained from the spacy pipeline)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('tetrandrine', 'TPC2'),\n",
+       " ('triiodothyronine', 'recovered'),\n",
+       " ('methylprednisolone', 'death'),\n",
+       " ('IFN-α', 'fever'),\n",
+       " ('angiotensin', 'vasodilator'),\n",
+       " ('ACE2', 'Ang'),\n",
+       " ('fibrin', 'COVID-19'),\n",
+       " ('hemorrhage', 'fibrinolysis'),\n",
+       " ('chloroquine', 'dendritic')]"
+      ]
+     },
+     "execution_count": 55,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "example_sources = ['tetrandrine','triiodothyronine','methylprednisolone','IFN-α','angiotensin','ACE2','fibrin','hemorrhage','chloroquine']\n",
+    "\n",
+    "example_targets = ['TPC2','recovered','death','fever','vasodilator','Ang','COVID-19','fibrinolysis','dendritic'] #low-grade fever\n",
+    "\n",
+    "example_source_target_dict = list(OrderedDict.fromkeys(zip(example_sources,example_targets)))\n",
+    "example_source_target_dict"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.4 Labeling functions for RE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Label values returned by the labeling functions\n",
+    "ABSTAIN = -1\n",
+    "NOT_FOUND = 0\n",
+    "FOUND = 1\n",
+    "\n",
+    "# Shared preprocessor: reads the `text` field and exposes the parsed result as `doc` (memoized)\n",
+    "spacy = SpacyPreprocessor(\n",
+    "    text_field=\"text\",\n",
+    "    doc_field=\"doc\",\n",
+    "    memoize=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.4.1 Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sentence:  While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry. \n",
+      "\n",
+      "Source-target pair:  ('tetrandrine', 'TPC2')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# NOTE(review): make_source_target_preprocessor is not defined in the cells above -- confirm\n",
+    "# it is defined/imported before this cell so the notebook survives Restart & Run All.\n",
+    "get_source_target = make_source_target_preprocessor(spacy, example_sources, example_targets)\n",
+    "\n",
+    "# Demonstrate the preprocessor on the first row: the result carries a `source_target` attribute\n",
+    "candidate = example_data.loc[0]\n",
+    "candidate_with_function_applied = get_source_target(candidate)\n",
+    "\n",
+    "print(\"Sentence: \", candidate[\"text\"], '\\n')\n",
+    "print(\"Source-target pair: \", candidate_with_function_applied.source_target)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sentence:  Thyroid stimulating hormone and free triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in recovered patients (1.4 mIU/mL and 4.3 pmol/L). \n",
+      "\n",
+      "Text Between:  triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in \n",
+      "\n",
+      "Text to the left:  Thyroid stimulating hormone and free \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# NOTE(review): make_text_between_preprocessor is not defined in the cells above -- confirm\n",
+    "# it is defined/imported before this cell so the notebook survives Restart & Run All.\n",
+    "get_text_between = make_text_between_preprocessor(spacy, example_sources, example_targets)\n",
+    "\n",
+    "# ---------- function example ----------\n",
+    "# The processed data point exposes `text_between` and `text_to_source_left`\n",
+    "candidate = example_data.loc[2]\n",
+    "candidate_with_function_applied = get_text_between(candidate)\n",
+    "\n",
+    "print(\"Sentence: \", candidate[\"text\"], '\\n')\n",
+    "print(\"Text Between: \", candidate_with_function_applied.text_between, '\\n')\n",
+    "print(\"Text to the left: \", candidate_with_function_applied.text_to_source_left, '\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.4.2 Defining the labelling functions (lf)\n",
+    "\n",
+    "For the final labelling model to work, at least 3 rules are needed.\n",
+    "\n",
+    "##### A. Positive rules: rules which tell you what a sentence with negative correlation looks like\n",
+    "    \n",
+    "1) does the sentence contain reduction-related words like 'decreased','reduced','lowered'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reduction_tokens = {'decreased',\n",
+    "                            'lower',\n",
+    "                            'reduced',\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@labeling_function(pre=[spacy],resources=dict(reduction_tokens=reduction_tokens))\n",
+    "def contains_reduction_tokens(x,reduction_tokens):\n",
+    "    \"\"\"Positive rule: vote FOUND when any reduction word is a token of the sentence, else abstain.\"\"\"\n",
+    "    sentence_tokens = {str(token) for token in x.doc}\n",
+    "    return ABSTAIN if reduction_tokens.isdisjoint(sentence_tokens) else FOUND\n",
+    "\n",
+    "#positive rule - version 2: only look at the text between the source and the target\n",
+    "@labeling_function(pre=[spacy,get_text_between],resources=dict(reduction_tokens=reduction_tokens))\n",
+    "def contains_reduction_tokens_text_between(x,reduction_tokens):\n",
+    "    \"\"\"Vote FOUND when any reduction word is a token of ``x.text_between``, else abstain.\"\"\"\n",
+    "    between_tokens = {str(token) for token in x.text_between}\n",
+    "    return ABSTAIN if reduction_tokens.isdisjoint(between_tokens) else FOUND"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "2) Does the sentence contain the expression 'negative correlation', 'negatively correlated', 'negatively related', 'inversely related', 'inverse relation', 'negative effect' or 'move in opposite directions'?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "negative_correlation_regex_1 = 'negative correlation'\n",
+    "negative_correlation_regex_2 = 'negatively correlated'\n",
+    "negative_correlation_regex_3 = 'negatively related'\n",
+    "negative_correlation_regex_4 = 'inversely related'\n",
+    "negative_correlation_regex_5 = 'inverse relation'\n",
+    "negative_correlation_regex_6 = 'negative effect'\n",
+    "negative_correlation_regex_7 = 'move in opposite directions'\n",
+    "\n",
+    "@labeling_function()\n",
+    "def contains_negative_corrrelation_regex(x):\n",
+    "    \"\"\"Positive rule: vote FOUND when the sentence contains a negative-correlation phrase.\n",
+    "\n",
+    "    The search is case-insensitive; the rule abstains when no phrase matches.\n",
+    "    \"\"\"\n",
+    "    negative_regexes = (\n",
+    "        negative_correlation_regex_1,\n",
+    "        negative_correlation_regex_2,\n",
+    "        negative_correlation_regex_3,\n",
+    "        negative_correlation_regex_4,\n",
+    "        negative_correlation_regex_5,\n",
+    "        negative_correlation_regex_6,\n",
+    "        negative_correlation_regex_7,\n",
+    "    )\n",
+    "\n",
+    "    # any() stops at the first matching phrase, like the if/elif chain it replaces\n",
+    "    if any(re.search(regex, x.text, flags=re.I) for regex in negative_regexes):\n",
+    "        return FOUND\n",
+    "    return ABSTAIN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@labeling_function(pre=[spacy,get_text_between])\n",
+    "def contains_increase_decrease_pattern(x):\n",
+    "    \"\"\"Positive rule: FOUND when 'increase' is left of the source and 'decrease' between source and target, or vice versa.\"\"\"\n",
+    "    left_tokens = [str(token) for token in x.text_to_source_left]\n",
+    "    between_tokens = [str(token) for token in x.text_between]\n",
+    "    if ('increase' in left_tokens and 'decrease' in between_tokens) or ('decrease' in left_tokens and 'increase' in between_tokens):\n",
+    "        return FOUND\n",
+    "    return ABSTAIN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### B. Negative rules: rules which tell you what a sentence without negative correlation looks like\n",
+    "\n",
+    "1) Does the sentence contain increase-related words like 'increased', 'higher'?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "increase_tokens = {'increased',\n",
+    "                            'higher',\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@labeling_function(pre=[spacy],resources=dict(increase_tokens=increase_tokens))\n",
+    "def contains_increase_tokens(x,increase_tokens):\n",
+    "    \"\"\"Negative rule: vote NOT_FOUND when any increase word is a token of the sentence, else abstain.\"\"\"\n",
+    "    sentence_tokens = {str(token) for token in x.doc}\n",
+    "    return ABSTAIN if increase_tokens.isdisjoint(sentence_tokens) else NOT_FOUND\n",
+    "\n",
+    "@labeling_function(pre=[spacy,get_text_between],resources=dict(increase_tokens=increase_tokens))\n",
+    "def contains_increase_tokens_text_between(x, increase_tokens):\n",
+    "    \"\"\"Vote NOT_FOUND when any increase word is a token of ``x.text_between``, else abstain.\"\"\"\n",
+    "    between_tokens = {str(token) for token in x.text_between}\n",
+    "    return ABSTAIN if increase_tokens.isdisjoint(between_tokens) else NOT_FOUND"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related' or 'positive effect' or 'move in the same direction'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Case-insensitive phrases that state a positive (same-direction) relationship.\n",
+    "positive_correlation_regex_1 = 'positive correlation'\n",
+    "positive_correlation_regex_2 = 'positively correlated'\n",
+    "positive_correlation_regex_3 = 'positively related'\n",
+    "positive_correlation_regex_4 = 'positive effect'\n",
+    "positive_correlation_regex_5 = 'move in the same direction'\n",
+    "\n",
+    "@labeling_function()\n",
+    "def contains_positive_corrrelation_regex(x):\n",
+    "    \"\"\"Negative rule: vote NOT_FOUND when the sentence contains a positive-correlation phrase.\n",
+    "\n",
+    "    A stated positive correlation is evidence against a negative one, so this rule\n",
+    "    votes NOT_FOUND (never FOUND) and abstains when none of the phrases matches.\n",
+    "    \"\"\"\n",
+    "    positive_regexes = (positive_correlation_regex_1, positive_correlation_regex_2,\n",
+    "                        positive_correlation_regex_3, positive_correlation_regex_4,\n",
+    "                        positive_correlation_regex_5)\n",
+    "\n",
+    "    if any(re.search(regex, x.text, flags=re.I) for regex in positive_regexes):\n",
+    "        return NOT_FOUND\n",
+    "    return ABSTAIN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3) does the syntax follow the structure 'increase in x is associated with an increase in y' (or 'decrease in x is associated with a decrease in y')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@labeling_function(pre=[spacy,get_text_between])\n",
+    "def contains_increase_increase_pattern(x):\n",
+    "    \"\"\"Negative rule: NOT_FOUND when 'increase' (or 'decrease') occurs both left of the source and between source and target.\"\"\"\n",
+    "    left_tokens = {str(token) for token in x.text_to_source_left}\n",
+    "    between_tokens = {str(token) for token in x.text_between}\n",
+    "    if any(word in left_tokens and word in between_tokens for word in ('increase', 'decrease')):\n",
+    "        return NOT_FOUND  # same-direction movement is evidence against a negative correlation\n",
+    "    return ABSTAIN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.5 Creating all the labels for the different rules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
+      "  from pandas import Panel\n",
+      "100%|█████████████████████████████████████████████████████████████████████████████| 2227/2227 [00:10<00:00, 213.19it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "label_functions_list = [contains_reduction_tokens,\n",
+    "                        contains_reduction_tokens_text_between,\n",
+    "                        contains_negative_corrrelation_regex,\n",
+    "                        contains_increase_decrease_pattern,\n",
+    "                        contains_increase_tokens,\n",
+    "                        contains_increase_tokens_text_between,\n",
+    "                        contains_positive_corrrelation_regex,\n",
+    "                        contains_increase_increase_pattern\n",
+    "                       ]\n",
+    "\n",
+    "applier = PandasLFApplier(label_functions_list)\n",
+    "\n",
+    "label_matrix_train = applier.apply(df_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.6 Examining the quality of the labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>j</th>\n",
+       "      <th>Polarity</th>\n",
+       "      <th>Coverage</th>\n",
+       "      <th>Overlaps</th>\n",
+       "      <th>Conflicts</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>contains_reduction_tokens</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.057027</td>\n",
+       "      <td>0.019308</td>\n",
+       "      <td>0.016165</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_reduction_tokens_text_between</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.004041</td>\n",
+       "      <td>0.004041</td>\n",
+       "      <td>0.000898</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_negative_corrrelation_regex</td>\n",
+       "      <td>2</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.008532</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_decrease_pattern</td>\n",
+       "      <td>3</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_tokens</td>\n",
+       "      <td>4</td>\n",
+       "      <td>[0]</td>\n",
+       "      <td>0.130220</td>\n",
+       "      <td>0.016165</td>\n",
+       "      <td>0.016165</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_tokens_text_between</td>\n",
+       "      <td>5</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_positive_corrrelation_regex</td>\n",
+       "      <td>6</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.002245</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_increase_pattern</td>\n",
+       "      <td>7</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        j Polarity  Coverage  Overlaps  \\\n",
+       "contains_reduction_tokens               0      [1]  0.057027  0.019308   \n",
+       "contains_reduction_tokens_text_between  1      [1]  0.004041  0.004041   \n",
+       "contains_negative_corrrelation_regex    2      [1]  0.008532  0.000000   \n",
+       "contains_increase_decrease_pattern      3       []  0.000000  0.000000   \n",
+       "contains_increase_tokens                4      [0]  0.130220  0.016165   \n",
+       "contains_increase_tokens_text_between   5       []  0.000000  0.000000   \n",
+       "contains_positive_corrrelation_regex    6      [1]  0.002245  0.000000   \n",
+       "contains_increase_increase_pattern      7       []  0.000000  0.000000   \n",
+       "\n",
+       "                                        Conflicts  \n",
+       "contains_reduction_tokens                0.016165  \n",
+       "contains_reduction_tokens_text_between   0.000898  \n",
+       "contains_negative_corrrelation_regex     0.000000  \n",
+       "contains_increase_decrease_pattern       0.000000  \n",
+       "contains_increase_tokens                 0.016165  \n",
+       "contains_increase_tokens_text_between    0.000000  \n",
+       "contains_positive_corrrelation_regex     0.000000  \n",
+       "contains_increase_increase_pattern       0.000000  "
+      ]
+     },
+     "execution_count": 68,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#in the absence of a benchmark to compare against\n",
+    "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>j</th>\n",
+       "      <th>Polarity</th>\n",
+       "      <th>Coverage</th>\n",
+       "      <th>Overlaps</th>\n",
+       "      <th>Conflicts</th>\n",
+       "      <th>Correct</th>\n",
+       "      <th>Incorrect</th>\n",
+       "      <th>Emp. Acc.</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>contains_reduction_tokens</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.057027</td>\n",
+       "      <td>0.019308</td>\n",
+       "      <td>0.016165</td>\n",
+       "      <td>69</td>\n",
+       "      <td>58</td>\n",
+       "      <td>0.543307</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_reduction_tokens_text_between</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.004041</td>\n",
+       "      <td>0.004041</td>\n",
+       "      <td>0.000898</td>\n",
+       "      <td>8</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.888889</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_negative_corrrelation_regex</td>\n",
+       "      <td>2</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.008532</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>18</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.947368</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_decrease_pattern</td>\n",
+       "      <td>3</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_tokens</td>\n",
+       "      <td>4</td>\n",
+       "      <td>[0]</td>\n",
+       "      <td>0.130220</td>\n",
+       "      <td>0.016165</td>\n",
+       "      <td>0.016165</td>\n",
+       "      <td>263</td>\n",
+       "      <td>27</td>\n",
+       "      <td>0.906897</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_tokens_text_between</td>\n",
+       "      <td>5</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_positive_corrrelation_regex</td>\n",
+       "      <td>6</td>\n",
+       "      <td>[1]</td>\n",
+       "      <td>0.002245</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>contains_increase_increase_pattern</td>\n",
+       "      <td>7</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        j Polarity  Coverage  Overlaps  \\\n",
+       "contains_reduction_tokens               0      [1]  0.057027  0.019308   \n",
+       "contains_reduction_tokens_text_between  1      [1]  0.004041  0.004041   \n",
+       "contains_negative_corrrelation_regex    2      [1]  0.008532  0.000000   \n",
+       "contains_increase_decrease_pattern      3       []  0.000000  0.000000   \n",
+       "contains_increase_tokens                4      [0]  0.130220  0.016165   \n",
+       "contains_increase_tokens_text_between   5       []  0.000000  0.000000   \n",
+       "contains_positive_corrrelation_regex    6      [1]  0.002245  0.000000   \n",
+       "contains_increase_increase_pattern      7       []  0.000000  0.000000   \n",
+       "\n",
+       "                                        Conflicts  Correct  Incorrect  \\\n",
+       "contains_reduction_tokens                0.016165       69         58   \n",
+       "contains_reduction_tokens_text_between   0.000898        8          1   \n",
+       "contains_negative_corrrelation_regex     0.000000       18          1   \n",
+       "contains_increase_decrease_pattern       0.000000        0          0   \n",
+       "contains_increase_tokens                 0.016165      263         27   \n",
+       "contains_increase_tokens_text_between    0.000000        0          0   \n",
+       "contains_positive_corrrelation_regex     0.000000        0          5   \n",
+       "contains_increase_increase_pattern       0.000000        0          0   \n",
+       "\n",
+       "                                        Emp. Acc.  \n",
+       "contains_reduction_tokens                0.543307  \n",
+       "contains_reduction_tokens_text_between   0.888889  \n",
+       "contains_negative_corrrelation_regex     0.947368  \n",
+       "contains_increase_decrease_pattern       0.000000  \n",
+       "contains_increase_tokens                 0.906897  \n",
+       "contains_increase_tokens_text_between    0.000000  \n",
+       "contains_positive_corrrelation_regex     0.000000  \n",
+       "contains_increase_increase_pattern       0.000000  "
+      ]
+     },
+     "execution_count": 69,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#examining the quality of the labels in the presence of a benchmark to compare against\n",
+    "LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary(y_train.values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>While blocking TPC2 activity by tetrandrine, a...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>Thyroid stimulating hormone and free triiodoth...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>The administration of methylprednisolone appea...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>12</td>\n",
+       "      <td>Consistent with previous reports, 20mM NH4Cl a...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1644</td>\n",
+       "      <td>Actual bicarbonate and total carbon dioxide co...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1655</td>\n",
+       "      <td>Albumin concentrations were significantly lowe...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1657</td>\n",
+       "      <td>Moreover, the frequencies of regulatory T cell...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1658</td>\n",
+       "      <td>The reduced expressions of interferon-γ (IFN-γ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1668</td>\n",
+       "      <td>Spleen atrophy was observed in all reported ca...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>127 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                   text\n",
+       "0     While blocking TPC2 activity by tetrandrine, a...\n",
+       "2     Thyroid stimulating hormone and free triiodoth...\n",
+       "3     The administration of methylprednisolone appea...\n",
+       "7     Consistent with previous reports, 20mM NH4Cl a...\n",
+       "12    Consistent with previous reports, 20mM NH4Cl a...\n",
+       "...                                                 ...\n",
+       "1644  Actual bicarbonate and total carbon dioxide co...\n",
+       "1655  Albumin concentrations were significantly lowe...\n",
+       "1657  Moreover, the frequencies of regulatory T cell...\n",
+       "1658  The reduced expressions of interferon-γ (IFN-γ...\n",
+       "1668  Spleen atrophy was observed in all reported ca...\n",
+       "\n",
+       "[127 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#examine which sentences were labelled as showing negative correlation by the first label function (column 0: contains_reduction_tokens)\n",
+    "df_train.iloc[label_matrix_train[:, 0] == FOUND]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.7 Predict the final label\n",
+    "\n",
+    "Different models can be used to aggregate the outputs of the different label functions and predict the final label."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Cafral\\Anaconda3\\lib\\site-packages\\tqdm\\std.py:658: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
+      "  from pandas import Panel\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████| 557/557 [00:01<00:00, 282.64it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "#testing data\n",
+    "label_matrix_test = applier.apply(df_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Model 1 : majority model (mm)\n",
+    "majority_model = MajorityLabelVoter()\n",
+    "\n",
+    "#training data\n",
+    "mm_preds_class_train = majority_model.predict(L=label_matrix_train)\n",
+    "mm_preds_proba_train = majority_model.predict_proba(L=label_matrix_train)\n",
+    "\n",
+    "#testing data\n",
+    "mm_preds_class_test = majority_model.predict(L=label_matrix_test)\n",
+    "mm_preds_proba_test = majority_model.predict_proba(L=label_matrix_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 1, -1,  1, ..., -1, -1, -1])"
+      ]
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mm_preds_class_train # only the 1s and 0s are labels. The -1s are abstains i.e. unlabeled data points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[0. , 1. ],\n",
+       "       [0.5, 0.5],\n",
+       "       [0. , 1. ],\n",
+       "       ...,\n",
+       "       [0.5, 0.5],\n",
+       "       [0.5, 0.5],\n",
+       "       [0.5, 0.5]])"
+      ]
+     },
+     "execution_count": 74,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mm_preds_proba_train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model 2:label model (lm)\n",
+    "\n",
+    "#call the model\n",
+    "label_model = LabelModel(cardinality=2, verbose=True)\n",
+    "\n",
+    "#fit the model\n",
+    "num_epochs = 1000\n",
+    "log_frequency = 100\n",
+    "random_seed = 1\n",
+    "label_model.fit(L_train=label_matrix_train, n_epochs=num_epochs, log_freq=log_frequency, seed=random_seed)\n",
+    "\n",
+    "#generate labels for training data\n",
+    "lm_preds_proba_train = label_model.predict_proba(label_matrix_train)\n",
+    "lm_preds_class_train = probs_to_preds(lm_preds_proba_train)\n",
+    "\n",
+    "#generate labels for testing data\n",
+    "lm_preds_proba_test = label_model.predict_proba(label_matrix_test)\n",
+    "lm_preds_class_test = probs_to_preds(lm_preds_proba_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model 3 : Random Voter (rv)\n",
+    "\n",
+    "random_voter = RandomVoter()\n",
+    "\n",
+    "#training data\n",
+    "rv_preds_class_train = random_voter.predict(L=label_matrix_train)\n",
+    "rv_preds_proba_train = random_voter.predict_proba(L=label_matrix_train)\n",
+    "\n",
+    "#testing data\n",
+    "rv_preds_class_test = random_voter.predict(L=label_matrix_test)\n",
+    "rv_preds_proba_test = random_voter.predict_proba(L=label_matrix_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.8 Comparing different models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Majority Model \n",
+      " Accuracy: \n",
+      " train-> 0.5635383924562192 \n",
+      " test-> 0.4703770197486535 \n",
+      " AUC: \n",
+      " train-> 0.6010212548732079 \n",
+      " test-> 0.5227059436913452 \n",
+      "\n",
+      "Label Model \n",
+      " Accuracy: \n",
+      " train-> 0.5527615626403233 \n",
+      " test-> 0.49012567324955114 \n",
+      " AUC: \n",
+      " train-> 0.524345344386498 \n",
+      " test-> 0.4437434827945777 \n",
+      "\n",
+      "Random Voter Model \n",
+      " Accuracy: \n",
+      " train-> 0.5024696901661428 \n",
+      " test-> 0.5008976660682226 \n",
+      " AUC: \n",
+      " train-> 0.5126309212678816 \n",
+      " test-> 0.5023114355231144 \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_models = {'Majority Model':majority_model,\n",
+    "              'Label Model':label_model,\n",
+    "              'Random Voter Model':random_voter}\n",
+    "\n",
+    "for model_name,model in all_models.items():\n",
+    "    \n",
+    "    #accuracy\n",
+    "    train_acc = model.score(L=label_matrix_train, Y=y_train, tie_break_policy=\"random\")[\"accuracy\"]\n",
+    "    test_acc = model.score(L=label_matrix_test, Y=y_test, tie_break_policy=\"random\")[\"accuracy\"]\n",
+    "    \n",
+    "    #auc\n",
+    "    train_auc = metric_score(y_train, probs=model.predict_proba(L=label_matrix_train), metric='roc_auc')\n",
+    "    test_auc = metric_score(y_test, probs=model.predict_proba(L=label_matrix_test), metric='roc_auc')\n",
+    "    \n",
+    "    print(f'{model_name}','\\n',\n",
+    "          'Accuracy:','\\n','train->',train_acc,'\\n','test->',test_acc,'\\n',\n",
+    "          'AUC:','\\n','train->',train_auc,'\\n','test->',test_auc,'\\n')\n",
+    "    \n",
+    "    \n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
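+    "In the run shown above, the majority model has the highest test AUC (0.52, versus 0.50 for the random voter and 0.44 for the label model), so that's the best model; its labels are used in the next section."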
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3. Filter out unlabeled points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total points labelled in training data: 405\n",
+      "Total points labelled in testing data: 91\n"
+     ]
+    }
+   ],
+   "source": [
+    "#training labels\n",
+    "df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(\n",
+    "    X=df_train['text'], \n",
+    "    y=mm_preds_proba_train, \n",
+    "    L=label_matrix_train\n",
+    ")\n",
+    "\n",
+    "#testing labels\n",
+    "df_test_filtered, probs_test_filtered = filter_unlabeled_dataframe(\n",
+    "    X=df_test['text'], \n",
+    "    y=mm_preds_proba_test, \n",
+    "    L=label_matrix_test\n",
+    ")\n",
+    "\n",
+    "print('Total points labelled in training data:',len(df_train_filtered))\n",
+    "print('Total points labelled in testing data:',len(df_test_filtered))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# References\n",
+    "\n",
+    "https://www.snorkel.org/use-cases/spouse-demo\n",
+    "    \n",
+    "https://github.com/snorkel-team/snorkel-tutorials/blob/master/spouse/spouse_demo.ipynb\n",
+    "    \n",
+    "https://www.snorkel.org/use-cases/01-spam-tutorial\n",
+    "    \n",
+    "https://readthedocs.org/projects/snorkel/downloads/pdf/master/"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/immunology_kg/notebooks/snorkel_re_example/snorkel_preprocessing_example.py b/immunology_kg/notebooks/snorkel_re_example/snorkel_preprocessing_example.py
new file mode 100644
index 0000000..0183140
--- /dev/null
+++ b/immunology_kg/notebooks/snorkel_re_example/snorkel_preprocessing_example.py
@@ -0,0 +1,69 @@
+import snorkel
+
+from snorkel.preprocess import preprocessor
+from snorkel.preprocess.nlp import SpacyPreprocessor
+from snorkel.types import DataPoint
+
+spacy = SpacyPreprocessor(text_field="text", doc_field="doc", memoize=True)  # module-level instance; the factories below take their own `spacy` parameter, which shadows this name inside them
+
+def make_source_target_preprocessor(spacy, sources, targets):
+    @preprocessor(pre=[spacy])
+    def get_source_target(cand: DataPoint) -> DataPoint:
+        """Returnsthe source and target mentioned in the sentence."""
+        person_names = []
+
+        source = [token.text for token in cand.doc if token.text in sources]
+        target = [token.text for token in cand.doc if token.text in targets]
+
+        try:
+            cand.source_target = (source[0], target[0])
+        except:
+            cand.source_target = (np.nan, np.nan)
+        return cand
+    return get_source_target
+
+def make_text_between_preprocessor(spacy, sources, targets):
+    @preprocessor(pre=[spacy])
+    def get_text_between(cand: DataPoint) -> DataPoint:
+        """
+        Returns the text between a source-target pair and the text to the left of the source
+        """
+
+        source_idx = [token.i for token in cand.doc if token.text in sources]
+        target_idx = [token.i for token in cand.doc if token.text in targets]
+
+        try:
+
+            if (len(target_idx)==1) & (len(source_idx)==1) & (source_idx[0]<target_idx[0]):
+                cand.text_between = cand.doc[source_idx[0]:target_idx[0]]
+                cand.text_to_source_left = cand.doc[:source_idx[0]]
+
+            elif (len(target_idx)>1) & (len(source_idx)==1):
+                for target_index in target_idx:
+                    if source_idx[0]<target_index:
+                        cand.text_between = cand.doc[source_idx[0]:target_index]
+                        cand.text_to_source_left = cand.doc[:source_idx[0]]
+
+            elif (len(source_idx)>1) & (len(target_idx)==1):
+                for source_index in source_idx:
+                    if source_index<target_idx[0]:
+                        cand.text_between = cand.doc[source_index:target_idx[0]]
+                        cand.text_to_source_left = cand.doc[:source_index]
+
+            elif (len(source_idx)>1) & (len(target_idx)>1):
+                for source_index in source_idx:
+                    for target_index in target_idx:
+                        if source_index<target_index:
+                            cand.text_between = cand.doc[source_index:target_index]
+                            cand.text_to_source_left = cand.doc[:source_index]
+
+            else:
+                cand.text_between = 'NaN'
+                cand.text_to_source_left = 'NaN'
+        except:
+
+            cand.text_between = 'NaN'
+            cand.text_to_source_left = 'NaN'
+
+        return cand
+    return get_text_between

	Unnamed: 0	text	source	relation	target	link	pmc_id	doi_id
0	0	While blocking TPC2 activity by tetrandrine, a...	{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...	negativeCorrelation	{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...	{'annotations': {}, 'citation': {'authors': ['...	32221306.0	NaN
1	1	Chemoinformatics searches yielded 15 approved ...	{'(S)-verapamil': {'namespace': 'chebi', 'name...	negativeCorrelation	{'hypertension': {'namespace': 'doid', 'name':...	{'annotations': {}, 'citation': {'db': 'DOI', ...	NaN	https://doi.org/10.1101/2020.03.22.002386
2	2	Thyroid stimulating hormone and free triiodoth...	{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...	negativeCorrelation	{'COVID-19': {'namespace': 'doid', 'name': 'CO...	{'annotations': {'mesh': {'D044967': True}}, '...	32217556.0	NaN
3	3	Based on these results, we performed virtual d...	{\"4'-epidoxorubicin\": {'namespace': 'chebi', '...	decreases	{'3.4.22.69': {'namespace': 'eccode', 'name': ...	{'annotations': {}, 'citation': {'authors': ['...	32173287.0	NaN
4	4	Doctors can also use a clinically approved bil...	{'4-methylumbelliferone': {'namespace': 'chebi...	decreases	{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...	{'annotations': {'mesh': {'D008168': True}}, '...	32205856.0	NaN
5	5	Since Vitamin B3 is highly lung protective, it...	{'4-methylumbelliferone': {'namespace': 'chebi...	decreases	{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...	{'annotations': {}, 'citation': {'authors': ['...	32205856.0	NaN
6	6	Doctors can also use a clinically approved bil...	{'4-methylumbelliferone': {'namespace': 'chebi...	decreases	{'inflammatory response': {'namespace': 'go', ...	{'annotations': {'mesh': {'D008168': True}}, '...	32205856.0	NaN
	text	source	relation	target	link	pmc_id	doi_id
0	While blocking TPC2 activity by tetrandrine, a...	{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...	True	{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...	{'annotations': {}, 'citation': {'authors': ['...	32221306.0	NaN
1	Chemoinformatics searches yielded 15 approved ...	{'(S)-verapamil': {'namespace': 'chebi', 'name...	True	{'hypertension': {'namespace': 'doid', 'name':...	{'annotations': {}, 'citation': {'db': 'DOI', ...	NaN	https://doi.org/10.1101/2020.03.22.002386
2	Thyroid stimulating hormone and free triiodoth...	{\"3,3',5'-triiodothyronine\": {'namespace': 'ch...	True	{'COVID-19': {'namespace': 'doid', 'name': 'CO...	{'annotations': {'mesh': {'D044967': True}}, '...	32217556.0	NaN
3	The administration of methylprednisolone appea...	{'6-methylprednisolone': {'namespace': 'chebi'...	True	{'Death': {'namespace': 'mesh', 'name': 'Death...	{'annotations': {'doid': {'11394': True}}, 'ci...	32167524.0	NaN
4	Adverse reactions of IFN-α mainly include low-...	{'Interferon alfa-2a': {'namespace': 'chebi', ...	False	{'Low-grade fever': {'namespace': 'hp', 'name'...	{'annotations': {}, 'citation': {'authors': ['...	32166483.0	NaN
	j	Polarity	Coverage	Overlaps	Conflicts
contains_reduction_tokens	0	[1]	0.057027	0.019308	0.016165
contains_reduction_tokens_text_between	1	[1]	0.004041	0.004041	0.000898
contains_negative_corrrelation_regex	2	[1]	0.008532	0.000000	0.000000
contains_increase_decrease_pattern	3	[]	0.000000	0.000000	0.000000
contains_increase_tokens	4	[0]	0.130220	0.016165	0.016165
contains_increase_tokens_text_between	5	[]	0.000000	0.000000	0.000000
contains_positive_corrrelation_regex	6	[1]	0.002245	0.000000	0.000000
contains_increase_increase_pattern	7	[]	0.000000	0.000000	0.000000