diff --git a/PDF_KG_Neo.ipynb b/PDF_KG_Neo.ipynb new file mode 100644 index 000000000..fbb681cd1 --- /dev/null +++ b/PDF_KG_Neo.ipynb @@ -0,0 +1,344 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6MPwmWPzdt5n", + "outputId": "6bf14b01-a343-4c75-e86b-30810c634082" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m802.4/802.4 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m165.7/165.7 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m197.8/197.8 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m64.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m218.9/218.9 kB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.2/49.2 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.3/222.3 kB\u001b[0m \u001b[31m24.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m62.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for neo4j (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "llmx 0.0.15a0 requires cohere, which is not installed.\n", + "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "%pip install --upgrade --quiet langchain langchain-experimental langchain-openai neo4j" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KKBW-M3xdBd0", + "outputId": "89bcc07f-7636-41db-fb21-6846051844c6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting pypdf\n", + " Downloading pypdf-3.17.4-py3-none-any.whl (278 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/278.2 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.6/278.2 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.2/278.2 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pypdf\n", + "Successfully installed pypdf-3.17.4\n" + ] + } + ], + "source": [ + "%pip install pypdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "M0ewcIJCduti" + }, + "outputs": [], + "source": [ + "import os\n", + "from google.colab import userdata\n", + "#Add openAI API key\n", + "os.environ['OPENAI_API_KEY']=userdata.get('OPENAI_API_KEY')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "fxIY0RgWeEiC" + }, + "outputs": [], + "source": [ + "from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer\n", + "#Add diffbot API key\n", + "diffbot_api_key = userdata.get('diffbot_api_key')\n", + "diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "f92uFBSQe1BR" + }, + "outputs": [], + "source": [ + "from langchain_community.graphs import Neo4jGraph\n", + "url =userdata.get('neo_url')\n", + "username = userdata.get('neo_username')\n", + "password = userdata.get('neo_pwd')\n", + "graph = Neo4jGraph(url=url, username=username, password=password)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Coc7R5s2jZDP", + "outputId": "3fea9ac2-c9ee-475e-d56f-01b4155d0437" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-01-18 07:55:35-- https://www.yourdatateacher.com/wp-content/uploads/2021/05/Supervised-machine-learning-workflow.pdf\n", + "Resolving www.yourdatateacher.com (www.yourdatateacher.com)... 89.46.109.71\n", + "Connecting to www.yourdatateacher.com (www.yourdatateacher.com)|89.46.109.71|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 372942 (364K) [application/pdf]\n", + "Saving to: ‘Supervised-machine-learning-workflow.pdf’\n", + "\n", + "Supervised-machine- 100%[===================>] 364.20K 1.16MB/s in 0.3s \n", + "\n", + "2024-01-18 07:55:36 (1.16 MB/s) - ‘Supervised-machine-learning-workflow.pdf’ saved [372942/372942]\n", + "\n" + ] + } + ], + "source": [ + "!wget -nc 'https://www.yourdatateacher.com/wp-content/uploads/2021/05/Supervised-machine-learning-workflow.pdf'" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "4ADc4MozhFn9" + }, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import PyPDFLoader\n", + "\n", + "loader = PyPDFLoader(\"/content/Supervised-machine-learning-workflow.pdf\")\n", + "\n", + "pages = loader.load_and_split()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "binODXa2YEGr" + }, + "outputs": [], + "source": [ + "\n", + "from langchain.docstore.document import Document\n", + "for i in range(0,len(pages)):\n", + " # pages[i].page_content.replace('\\n',' ')\n", + " pages[i]=Document(page_content=pages[i].page_content.replace('\\n',' '), metadata={\"source\": \"local\"})\n", + "# doc = Document(page_content=\"text\", metadata={\"source\": \"local\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "IaLxe00ZeL-y" + }, + "outputs": [], + "source": [ + "graph_documents = diffbot_nlp.convert_to_graph_documents(pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "hQ0PBgcxeN1h" + }, + "outputs": [], + "source": [ + "graph.add_graph_documents(graph_documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "LRqqayPYf8Fp" + }, + "outputs": [], + "source": [ + "graph.refresh_schema()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Vg8opMfetBjJ", + "outputId": "872eadc5-66fc-4e47-a924-66c2f85908b1" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='The purpose of this e-book In this e-book, I propose the typical Supervised Machine Learning workflow that data scientists follow when they need to create a supervised model. This workflow may change from project to project, but the steps I’m going to show you in the next sections are the most common steps that are required. For each step of the workflow, I’ll give you a brief introduction and I’ll suggest one or more Python libraries that may be used in a project. www.yourdatateacher.com 2', metadata={'source': 'local'}),\n", + " Document(page_content='Step 1: Data extraction The first step of a machine learning model is related to data. Any supervised model learns from data we feed it with. Data can be extracted from a Data Lake, a Data Warehouse, some Excel and CSV files, an SQL database or other sources. The purpose of this phase is to collect data and build the dataset we are going to work with. The dataset is a huge table made of several columns (sometimes hundreds of columns) and thousands of rows. Each column is called “feature” and they are needed to predict the “target” column. Python libraries: pandas, numpy Step 2: Exploratory Data Analysis After we have gathered our data, we must take a look at it . It’s the purpose of the Exploratory Data Analysis (EDA). 
In this phase, we use graphical representations of our dataset in order to discover the most important variables, the correlations, the orders of magnitude, and the statistical properties of the features with respect to the target. The purpose of EDA is to extract information from our dataset to better understand the phenomena inside our data and to start figuring out which features are useful and which others are useless. Python libraries: pandas, numpy, scipy, seaborn, matplotlib www.yourdatateacher.com 3', metadata={'source': 'local'}),\n", + " Document(page_content='Step 3: Data cleaning Once we have defined our dataset, we need to clean it filling the missing values in the features. This procedure is called “cleaning” and it’s very important because not every model is able to handle missing data, so we need to fill the blanks in some way. Python libraries: scikit-learn Step 4: Encoding If our variables are categorical (i.e. not numerical), some models may not work properly with them. The majority of the models, in fact, can only handle numerical data. So, we need to convert categorical features to numerical features. This procedure is called “encoding”. Python libraries: scikit-learn Step 5: T ransforming Some models require particular transformations to be applied to a dataset before it can be used. For example, scaling the features to the same order of magnitude, symmetrize their probability distributions and other types of transformations. These numerical transformations are necessary to make our model work properly and they change according to the model. Python libraries: scikit-learn, imblearn, scipy www.yourdatateacher.com 4', metadata={'source': 'local'}),\n", + " Document(page_content='Step 6: Dimensionality reduction After we have transformed the features, a good choice is to reduce the dimensionality of our dataset removing the useless features. A first reduction has been made in the EDA phase, but a new reduction can be done as well if our dataset is still large and the features are still correlated with each other . Python libraries: scikit-learn Step 7: Model selection We can now select a supervised model that is able to generalize the training dataset and make accurate predictions even on datasets it has not been trained on. Every model has its own needs about the encoding of the categorical features and the transformations to be applied, so when we perform this research, we must keep in mind such requirements . Python libraries: scikit-learn www.yourdatateacher.com 5', metadata={'source': 'local'}),\n", + " Document(page_content='Step 8: Hyperparameter tuning Once the model has been selected, we can fine-tune its hyperparameters , which are some parameters whose values are set before the training phase. Hyperparameter tuning is necessary in order to make a model that better generalizes the training dataset. Python libraries: scikit-learn Step 9: Recursive Feature Elimination Once the model has been created and its hyperparameters have been optimized, we can use it for further feature selection and dimensionality reduction . It’s not mandatory, but it can be helpful . One common choice is to use a procedure called Recursive Feature Elimination. Python libraries: scikit-learn Step 10: Feature importance and model interpretation Finally, we must interpret and explain our model and calculate the importance of the features in order to catch and understand the information behind data and explain the phenomena that created our dataset. 
This is the last part of a machine learning workflow, but it’s probably the most important one . Understanding how our model works www.yourdatateacher.com 6', metadata={'source': 'local'}),\n", + " Document(page_content='and how information flows through data is the core of every data science project. Python libraries: scikit-learn, shap www.yourdatateacher.com 7', metadata={'source': 'local'}),\n", + " Document(page_content='Suggested courses For the data manipulation and transformation process, I suggest attending my Data pre-processing for Machine Learning in Python online course. For the model selection , the hyperparameter tuning, the recursive feature elimination and the calculation of feature importance , I suggest attending my Supervised Machine Learning in Python online course. If you need a custom training program, you can benefit from my one-to-one coaching program. Just send me a message and we’ll talk about building a training program made by remote video lessons. www.yourdatateacher.com 8', metadata={'source': 'local'}),\n", + " Document(page_content='Who am I? My name is Gianluca Malato , I’m Italian and have a Master’s Degree cum laude in Theoretical Physics of disordered systems at “La Sapienza” University of Rome. I’m a Data Scientist who has been working for years in the banking and insurance sector. I have extensive experience in software programming and project management and I have been dealing with data analysis and machine learning in the corporate environment for several years. I’ve written many articles about Machine Learning, R and Python and I’ve been a Top Writer on Medium.com in Artificial Intelligence category. I teach Data Science on YourDataTeacher.com My e-mail address is gianluca@yourdatateacher.com www.yourdatateacher.com 9', metadata={'source': 'local'}),\n", + " Document(page_content='Table of contents The purpose of this e-book 2 Step 1: Data extraction 3 Step 2: Exploratory Data Analysis 3 Step 3: Data cleaning 4 Step 4: Encoding 4 Step 5: Transforming 4 Step 6: Dimensionality reduction 5 Step 7: Model selection 5 Step 8: Hyperparameter tuning 6 Step 9: Recursive Feature Elimination 6 Step 10: Feature importance and model interpretation 6 Suggested courses 8 Who am I? 9 Table of contents 10 www.yourdatateacher.com 10', metadata={'source': 'local'})]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pages" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "eS0HFed8gCCc" + }, + "outputs": [], + "source": [ + "from langchain.chains import GraphCypherQAChain\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "chain = GraphCypherQAChain.from_llm(\n", + " cypher_llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\"),\n", + " qa_llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"),\n", + " graph=graph,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 246 + }, + "id": "vLO7ExAegP30", + "outputId": "0928dd67-e0f2-47d6-e4b8-890240d87a6e" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The function `run` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. 
Use invoke instead.\n", + " warn_deprecated(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", + "Generated Cypher:\n", + "\u001b[32;1m\u001b[1;3mMATCH (p:Person {name: \"Gianluca Malato\"}) RETURN p\u001b[0m\n", + "Full Context:\n", + "\u001b[32;1m\u001b[1;3m[{'p': {'name': 'Gianluca Malato', 'id': 'http://www.linkedin.com/in/gianlucamalato', 'positionHeld': 'Data Scientist'}}]\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'Gianluca Malato is a Data Scientist.'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# chain.run(\"What is machine learning\")\n", + "chain.run(\"Who is Gianluca Malato?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "UQHW_I1xkPzr" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
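
Note on the last executed cell: its stderr output warns that `chain.run` was deprecated in LangChain 0.1.0 in favour of `invoke`. A minimal sketch of the replacement call is below; it assumes LangChain >= 0.1.0, the same Colab `userdata` secrets used in the notebook, and that `"query"`/`"result"` are the chain's default input and output keys (verify against your installed version). It mirrors the notebook's own chain setup rather than defining anything new.

# Sketch only: modern replacement for the deprecated chain.run(...) call in the
# final query cell. Assumes LangChain >= 0.1.0 and the Neo4j/OpenAI credentials
# stored in Colab userdata exactly as in the notebook above.
from google.colab import userdata
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI

graph = Neo4jGraph(
    url=userdata.get('neo_url'),
    username=userdata.get('neo_username'),
    password=userdata.get('neo_pwd'),
)

chain = GraphCypherQAChain.from_llm(
    cypher_llm=ChatOpenAI(temperature=0, model_name="gpt-4"),
    qa_llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"),
    graph=graph,
    verbose=True,
)

# invoke() takes a dict keyed by "query" and returns a dict that includes a
# "result" key in recent LangChain releases (an assumption; check your version).
response = chain.invoke({"query": "Who is Gianluca Malato?"})
print(response["result"])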