From a07b0d3040b4fb1c29034e8117753217d6b93dff Mon Sep 17 00:00:00 2001
From: Abdellahitech <abdellahi.mezid@hrflow.ai>
Date: Tue, 29 Oct 2024 02:02:22 +0100
Subject: [PATCH] feat : new scoring evaluation notebook

---
 .../colab/scoring_evaluation_notebook.ipynb   | 418 ++++++++++++++++++
 1 file changed, 418 insertions(+)
 create mode 100644 examples/colab/scoring_evaluation_notebook.ipynb

diff --git a/examples/colab/scoring_evaluation_notebook.ipynb b/examples/colab/scoring_evaluation_notebook.ipynb
new file mode 100644
index 0000000..7c83574
--- /dev/null
+++ b/examples/colab/scoring_evaluation_notebook.ipynb
@@ -0,0 +1,418 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Copyright 2020 HrFlow's AI Research Department\n",
+    "\n",
+    "Licensed under the Apache License, Version 2.0 (the \"License\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright 2020 HrFlow's AI Research Department. All Rights Reserved.\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "# =============================================================================="
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Overview\n",
+    "This notebook evaluates the scoring of profiles against a specific job, or of jobs against a specific profile.\n",
+    "It is structured as follows:\n",
+    "* General functions:\n",
+    "    * Get all items\n",
+    "    * Get scoring results\n",
+    "    * Tag item\n",
+    "* Score profiles for a specific job\n",
+    "* Score jobs for a specific profile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import os\n",
+    "from datetime import datetime\n",
+    "from tqdm import tqdm\n",
+    "from hrflow import Hrflow\n",
+    "from dotenv import load_dotenv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "API_SECRET = os.getenv(\"API_SECRET\")  # os.getenv yields None when the variable is not set\n",
+    "API_USER = os.getenv(\"API_USER\")\n",
+    "ALGORITHM = os.getenv(\"ALGORITHM\")  # sent as agent_key with the scoring requests\n",
+    "BOARD_KEY = os.getenv(\"BOARD_KEY\")  # sent as board_key alongside job_key when scoring profiles\n",
+    "BOARD_KEYS = os.getenv(\"BOARD_KEYS\")  # raw string here; decoded with json.loads in the next cell\n",
+    "SOURCE_KEY = os.getenv(\"SOURCE_KEY\")  # sent as source_key alongside profile_key when scoring jobs\n",
+    "SOURCE_KEYS = os.getenv(\"SOURCE_KEYS\")  # raw string here; decoded with json.loads in the next cell\n",
+    "OUTPUT_FILE = os.getenv(\"OUTPUT_FILE\")  # path given to DataFrame.to_excel by both export cells\n",
+    "LIMIT_SCORING = \"32\"  # `limit` sent with every scoring request\n",
+    "LIMIT_SEARCHING = \"10000\"  # `limit` sent with every searching request\n",
+    "ALGORITHM_FAMILY = \"tagger-rome4-family\"  # the four values below are algorithm_key inputs of tagger_romev4\n",
+    "ALGORITHM_SUBFAMILY = \"tagger-rome4-subfamily\"\n",
+    "ALGORITHM_CATEGORY = \"tagger-rome4-category\"\n",
+    "ALGORITHM_JOB_TITLE = \"tagger-rome4-jobtitle\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SOURCE_KEYS = json.loads(os.environ[\"SOURCE_KEYS\"])  # decode from the environment so re-running this cell is safe\n",
+    "BOARD_KEYS = json.loads(os.environ[\"BOARD_KEYS\"])  # a missing variable raises a KeyError that names it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Hrflow(api_secret=API_SECRET,api_user=API_USER)  # passed to the searching and scoring helpers below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Score profiles against a job (source_keys given) or jobs against a profile (board_keys given)\n",
+    "def get_scoring_items(client, item, source_keys=None, board_keys=None):\n",
+    "    \"\"\"Call the scoring API for `item` and return `(item, scored_items, scores)`.\n",
+    "\n",
+    "    Returns None, after printing the response, when its \"code\" is not 200.\n",
+    "    \"\"\"\n",
+    "    scoring_profiles = bool(source_keys)\n",
+    "    if scoring_profiles:\n",
+    "        # `item` supplies the job key; BOARD_KEY, LIMIT_SCORING and ALGORITHM are notebook globals.\n",
+    "        response_scoring = client.profile.scoring.list(\n",
+    "            job_key=item[\"key\"], board_key=BOARD_KEY, source_keys=source_keys,\n",
+    "            limit=LIMIT_SCORING, agent_key=ALGORITHM,\n",
+    "        )\n",
+    "    else:\n",
+    "        # `item` supplies the profile key; SOURCE_KEY is a notebook global.\n",
+    "        assert board_keys is not None\n",
+    "        response_scoring = client.job.scoring.list(\n",
+    "            profile_key=item[\"key\"], source_key=SOURCE_KEY, board_keys=board_keys,\n",
+    "            limit=LIMIT_SCORING, agent_key=ALGORITHM,\n",
+    "        )\n",
+    "\n",
+    "    if response_scoring[\"code\"] != 200:\n",
+    "        print(\"error while returning scoring:\", response_scoring)\n",
+    "        return None\n",
+    "    payload = response_scoring[\"data\"]\n",
+    "    scored_items = payload[\"profiles\"] if scoring_profiles else payload[\"jobs\"]\n",
+    "    # The score is the element at index 1 of each prediction.\n",
+    "    scores = [prediction[1] for prediction in payload[\"predictions\"]]\n",
+    "    return item, scored_items, scores\n",
+    "\n",
+    "# Fetch the items (profiles or jobs) stored in the given sources or boards.\n",
+    "def get_items_searching(client, source_keys=None, board_keys=None):\n",
+    "    \"\"\"Search profiles in `source_keys` if given, otherwise jobs in `board_keys`.\n",
+    "\n",
+    "    Returns the value found under data[\"profiles\"] or data[\"jobs\"] of the\n",
+    "    searching response, or None (after printing it) when its code is not 200.\n",
+    "    \"\"\"\n",
+    "    if source_keys:\n",
+    "        items_field = \"profiles\"\n",
+    "        response_searching = client.profile.searching.list(\n",
+    "            source_keys=source_keys, limit=LIMIT_SEARCHING, order_by=\"desc\"\n",
+    "        )\n",
+    "    else:\n",
+    "        assert board_keys is not None\n",
+    "        items_field = \"jobs\"\n",
+    "        response_searching = client.job.searching.list(\n",
+    "            board_keys=board_keys, limit=LIMIT_SEARCHING, order_by=\"desc\"\n",
+    "        )\n",
+    "\n",
+    "    if response_searching[\"code\"] != 200:\n",
+    "        print(\"error while returning searching:\", response_searching)\n",
+    "        return None\n",
+    "\n",
+    "    return response_searching[\"data\"][items_field]\n",
+    "\n",
+    "## function to tag items\n",
+    "def tagger_romev4(text, algorithm, timeout=30):\n",
+    "    \"\"\"Return the first tag the tagging endpoint predicts for `text`, or None.\n",
+    "\n",
+    "    None is returned when the request fails or the status is not 200 (the reason\n",
+    "    is printed), or when no usable tag is found; `timeout` (seconds) bounds the call.\n",
+    "    \"\"\"\n",
+    "    payload = {\"algorithm_key\": algorithm, \"texts\": [text], \"top_n\": 1}\n",
+    "    headers = {\n",
+    "        \"accept\": \"application/json\",\n",
+    "        \"content-type\": \"application/json\",\n",
+    "        \"X-API-KEY\": API_SECRET,\n",
+    "        \"X-USER-EMAIL\": API_USER,\n",
+    "    }\n",
+    "    try:\n",
+    "        response = requests.post(\n",
+    "            \"https://api.hrflow.ai/v1/text/tagging\", json=payload, headers=headers, timeout=timeout\n",
+    "        )\n",
+    "    except requests.RequestException as error:\n",
+    "        print(f\"HTTP error: {error}\")\n",
+    "        return None\n",
+    "    if response.status_code != 200:\n",
+    "        print(f\"HTTP error: {response.text}\")\n",
+    "        return None\n",
+    "    data = response.json().get(\"data\")\n",
+    "    # `data` must be a non-empty list whose first entry is a dict; anything else carries no tag.\n",
+    "    if not (isinstance(data, list) and data and isinstance(data[0], dict)):\n",
+    "        return None\n",
+    "    tags = data[0].get(\"tags\")\n",
+    "    return tags[0] if tags and isinstance(tags, list) else None\n",
+    "\n",
+    "def format_date(date_str: str) -> str:\n",
+    "    \"\"\"Return the YYYY-MM-DD part of a `%Y-%m-%dT%H:%M:%S<offset>` timestamp, or None when `date_str` is empty.\"\"\"\n",
+    "    # %z accepts any UTC offset, so timestamps not ending in the literal \"+0000\" no longer raise ValueError.\n",
+    "    return datetime.strptime(date_str, \"%Y-%m-%dT%H:%M:%S%z\").strftime(\"%Y-%m-%d\") if date_str else None\n",
+    "\n",
+    "def categorize_scores(scores):\n",
+    "    \"\"\"Count the scores in each 0.2-wide star band (exactly 1 counts as 5 stars; values outside [0, 1] are ignored).\"\"\"\n",
+    "    lower_bands = (\n",
+    "        (\"4 stars\", 0.6, 0.8), (\"3 stars\", 0.4, 0.6),\n",
+    "        (\"2 stars\", 0.2, 0.4), (\"1 star\", 0, 0.2),\n",
+    "    )\n",
+    "    star_count = {\"5 stars\": sum(1 for value in scores if 0.8 <= value <= 1)}\n",
+    "    star_count.update((label, sum(1 for value in scores if low <= value < high)) for label, low, high in lower_bands)\n",
+    "    return star_count\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Score profiles for a specific job"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "jobs = get_items_searching(client, board_keys=BOARD_KEYS)\n",
+    "# Tagger algorithm for each ROME v4 level; the key becomes the tag-name suffix.\n",
+    "job_taggers = {\n",
+    "    \"family\": ALGORITHM_FAMILY,\n",
+    "    \"subfamily\": ALGORITHM_SUBFAMILY,\n",
+    "    \"category\": ALGORITHM_CATEGORY,\n",
+    "    \"job_title\": ALGORITHM_JOB_TITLE,\n",
+    "}\n",
+    "for job in tqdm(jobs):\n",
+    "    # The text sent to the taggers is every section description joined by newlines.\n",
+    "    sections_desc = \"\\n\".join(section[\"description\"] for section in job[\"sections\"])\n",
+    "    tags_data = {level: tagger_romev4(sections_desc, algo) for level, algo in job_taggers.items()}\n",
+    "    job[\"tags\"].extend(\n",
+    "        {\"name\": f\"hrflow_tag_romev4_{level}\", \"value\": tag} for level, tag in tags_data.items() if tag\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Score every job against the profiles of SOURCE_KEYS, then export one row per\n",
+    "# job with its star distribution and ROME v4 tags.\n",
+    "scoring_job_result = []\n",
+    "for job in tqdm(jobs):\n",
+    "    scoring_result = get_scoring_items(client, job, source_keys=SOURCE_KEYS)\n",
+    "    # get_scoring_items returns None when the API answers with a non-200 code;\n",
+    "    # such entries are skipped because the unpacking loop below cannot handle them.\n",
+    "    if scoring_result is not None:\n",
+    "        scoring_job_result.append(scoring_result)\n",
+    "\n",
+    "# Build one spreadsheet row per successfully scored job.\n",
+    "rows = []\n",
+    "for job, _, scores in scoring_job_result:\n",
+    "    star_count = categorize_scores(scores)\n",
+    "\n",
+    "    # Offer length = characters of all section descriptions joined by newlines.\n",
+    "    sections_desc = \"\\n\".join([section[\"description\"] for section in job[\"sections\"]])\n",
+    "    len_offres = len(sections_desc)\n",
+    "\n",
+    "    # Index tag values by name; with duplicate names the last tag wins.\n",
+    "    tag_values = {tag[\"name\"]: tag.get(\"value\") for tag in job[\"tags\"]}\n",
+    "    family = tag_values.get(\"hrflow_tag_romev4_family\")\n",
+    "    subfamily = tag_values.get(\"hrflow_tag_romev4_subfamily\")\n",
+    "    category = tag_values.get(\"hrflow_tag_romev4_category\")\n",
+    "    job_title = tag_values.get(\"hrflow_tag_romev4_job_title\")\n",
+    "\n",
+    "    rows.append({\n",
+    "        \"Nom\": job[\"name\"],\n",
+    "        \"Reference\": job[\"reference\"],\n",
+    "        \"Date de création\" : format_date(job[\"created_at\"]),\n",
+    "        \"Localisation\" : job[\"location\"][\"text\"],\n",
+    "        \"Nombre de caractères de l'offre\": len_offres ,\n",
+    "        \"Nombre de profils ayant 5 étoiles\": star_count[\"5 stars\"],\n",
+    "        \"Nombre de profils ayant 4 étoiles\": star_count[\"4 stars\"],\n",
+    "        \"Nombre de profils ayant 3 étoiles\": star_count[\"3 stars\"],\n",
+    "        \"Nombre de profils ayant 2 étoiles\": star_count[\"2 stars\"],\n",
+    "        \"Nombre de profils ayant 1 étoiles\": star_count[\"1 star\"],\n",
+    "        \"Tagger romev4 family\": family,\n",
+    "        \"Tagger romev4 subfamily\": subfamily,\n",
+    "        \"Tagger romev4 category\": category,\n",
+    "        \"Tagger romev4 job title\":  job_title,\n",
+    "    })\n",
+    "\n",
+    "df = pd.DataFrame(rows)\n",
+    "\n",
+    "# NOTE(review): the profile export cell further down writes to the same\n",
+    "# OUTPUT_FILE, so running both sections overwrites this report; use\n",
+    "# distinct paths if both spreadsheets are needed.\n",
+    "df.to_excel(OUTPUT_FILE, index=False)\n",
+    "\n",
+    "print(f\"Excel file '{OUTPUT_FILE}' generated successfully.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Score jobs for a specific profile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "profiles = get_items_searching(client, source_keys=SOURCE_KEYS)\n",
+    "# Tagger algorithm for each ROME v4 level; the key becomes the tag-name suffix.\n",
+    "profile_taggers = {\n",
+    "    \"family\": ALGORITHM_FAMILY,\n",
+    "    \"subfamily\": ALGORITHM_SUBFAMILY,\n",
+    "    \"category\": ALGORITHM_CATEGORY,\n",
+    "    \"job_title\": ALGORITHM_JOB_TITLE,\n",
+    "}\n",
+    "for profile in tqdm(profiles):\n",
+    "    tags_data = {level: tagger_romev4(profile[\"text\"], algo) for level, algo in profile_taggers.items()}\n",
+    "    profile[\"tags\"].extend(\n",
+    "        {\"name\": f\"hrflow_tag_romev4_{level}\", \"value\": tag} for level, tag in tags_data.items() if tag\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Score every profile against the jobs of BOARD_KEYS, then export one row per\n",
+    "# profile with its star distribution and ROME v4 tags.\n",
+    "scoring_profile_result = []\n",
+    "for profile in tqdm(profiles):\n",
+    "    scoring_result = get_scoring_items(client, profile, board_keys=BOARD_KEYS)\n",
+    "    # get_scoring_items returns None when the API answers with a non-200 code;\n",
+    "    # such entries are skipped because the unpacking loop below cannot handle them.\n",
+    "    if scoring_result is not None:\n",
+    "        scoring_profile_result.append(scoring_result)\n",
+    "\n",
+    "rows = []\n",
+    "for profile, _, scores in scoring_profile_result:\n",
+    "    star_count = categorize_scores(scores)\n",
+    "    # Measure the profile's own text (the field sent to the tagger) so this cell\n",
+    "    # does not depend on variables left behind by the job section.\n",
+    "    len_profile = len(profile[\"text\"] or \"\")\n",
+    "\n",
+    "    # Index tag values by name; with duplicate names the last tag wins.\n",
+    "    tag_values = {tag[\"name\"]: tag.get(\"value\") for tag in profile[\"tags\"]}\n",
+    "    family = tag_values.get(\"hrflow_tag_romev4_family\")\n",
+    "    subfamily = tag_values.get(\"hrflow_tag_romev4_subfamily\")\n",
+    "    category = tag_values.get(\"hrflow_tag_romev4_category\")\n",
+    "    job_title = tag_values.get(\"hrflow_tag_romev4_job_title\")\n",
+    "\n",
+    "    rows.append({\n",
+    "        \"Nom\": profile[\"name\"],\n",
+    "        \"Reference\": profile[\"reference\"],\n",
+    "        \"Date de reception\" : format_date(profile[\"created_at\"]),\n",
+    "        \"Localisation\" : profile[\"location\"][\"text\"],\n",
+    "        \"Nombre de caractères du profil\": len_profile,\n",
+    "        \"Nombre de jobs ayant 5 étoiles\": star_count[\"5 stars\"],\n",
+    "        \"Nombre de jobs ayant 4 étoiles\": star_count[\"4 stars\"],\n",
+    "        \"Nombre de jobs ayant 3 étoiles\": star_count[\"3 stars\"],\n",
+    "        \"Nombre de jobs ayant 2 étoiles\": star_count[\"2 stars\"],\n",
+    "        \"Nombre de jobs ayant 1 étoiles\": star_count[\"1 star\"],\n",
+    "        \"Tagger romev4 family\": family,\n",
+    "        \"Tagger romev4 subfamily\": subfamily,\n",
+    "        \"Tagger romev4 category\": category,\n",
+    "        \"Tagger romev4 job title\":  job_title,\n",
+    "    })\n",
+    "\n",
+    "df = pd.DataFrame(rows)\n",
+    "\n",
+    "# NOTE(review): the job export cell above writes to the same OUTPUT_FILE, so this\n",
+    "# call overwrites that report; use distinct paths if both spreadsheets are needed.\n",
+    "df.to_excel(OUTPUT_FILE, index=False)\n",
+    "\n",
+    "print(f\"Excel file '{OUTPUT_FILE}' generated successfully.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "customers_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}