From a07b0d3040b4fb1c29034e8117753217d6b93dff Mon Sep 17 00:00:00 2001 From: Abdellahitech Date: Tue, 29 Oct 2024 02:02:22 +0100 Subject: [PATCH] feat : new scoring evaluation notebook --- .../colab/scoring_evaluation_notebook.ipynb | 418 ++++++++++++++++++ 1 file changed, 418 insertions(+) create mode 100644 examples/colab/scoring_evaluation_notebook.ipynb diff --git a/examples/colab/scoring_evaluation_notebook.ipynb b/examples/colab/scoring_evaluation_notebook.ipynb new file mode 100644 index 0000000..7c83574 --- /dev/null +++ b/examples/colab/scoring_evaluation_notebook.ipynb @@ -0,0 +1,418 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Copyright 2020 HrFlow's AI Research Department\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2020 HrFlow's AI Research Department. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Overview\n", + "This notebook evaluates the scoring of profiles against a specific job, or of jobs against a specific profile.\n", + "It is structured as follows:\n", + "* 
# %% [markdown]
# Notebook outline:
# * General functions:
#   * Get all items
#   * Get scoring results
#   * Tag item
# * Score profiles for a specific job
# * Score jobs for a specific profile

# %%
import json
import os
from datetime import datetime
from typing import Optional

import pandas as pd
import requests
from dotenv import load_dotenv
from hrflow import Hrflow
from tqdm import tqdm

# %%
# Pull the settings below from a local .env file into the process environment.
load_dotenv()


# %%
def _load_key_list(env_name):
    """Return the JSON value stored in environment variable *env_name*.

    Parsing straight from the environment (instead of rebinding an already
    parsed global) keeps this cell safe to re-run, and a missing variable is
    reported by name rather than as an opaque ``json.loads`` TypeError.

    Raises:
        ValueError: if the variable is unset or empty.
    """
    raw_value = os.getenv(env_name)
    if not raw_value:
        raise ValueError(
            f"Environment variable {env_name} must contain a JSON list of keys"
        )
    return json.loads(raw_value)


API_SECRET = os.getenv("API_SECRET")
API_USER = os.getenv("API_USER")
ALGORITHM = os.getenv("ALGORITHM")
BOARD_KEY = os.getenv("BOARD_KEY")
BOARD_KEYS = _load_key_list("BOARD_KEYS")
SOURCE_KEY = os.getenv("SOURCE_KEY")
SOURCE_KEYS = _load_key_list("SOURCE_KEYS")
OUTPUT_FILE = os.getenv("OUTPUT_FILE")
# Page sizes are passed as integers.
# NOTE(review): the HrFlow SDK appears to document `limit` as an int - confirm.
LIMIT_SCORING = 32
LIMIT_SEARCHING = 10000
ALGORITHM_FAMILY = "tagger-rome4-family"
ALGORITHM_SUBFAMILY = "tagger-rome4-subfamily"
ALGORITHM_CATEGORY = "tagger-rome4-category"
ALGORITHM_JOB_TITLE = "tagger-rome4-jobtitle"

# Prefix of the tag names written by this notebook, and the tagger used for
# each tag-name suffix.
TAG_PREFIX = "hrflow_tag_romev4_"
TAGGER_ALGORITHMS = {
    "family": ALGORITHM_FAMILY,
    "subfamily": ALGORITHM_SUBFAMILY,
    "category": ALGORITHM_CATEGORY,
    "job_title": ALGORITHM_JOB_TITLE,
}
# Report star rating -> key used in the dict built by categorize_scores().
STAR_LABELS = {5: "5 stars", 4: "4 stars", 3: "3 stars", 2: "2 stars", 1: "1 star"}

# %%
client = Hrflow(api_secret=API_SECRET, api_user=API_USER)


# %%
def get_scoring_items(client, item, source_keys=None, board_keys=None):
    """Score profiles against a job, or jobs against a profile.

    When *source_keys* is truthy, *item* is treated as a job and the profiles
    of those sources are scored against it. Otherwise *item* is treated as a
    profile and the jobs of *board_keys* are scored against it.

    Args:
        client: HrFlow client exposing ``profile.scoring`` and ``job.scoring``.
        item: dict with at least a ``"key"`` entry (the job or profile key).
        source_keys: source keys whose profiles are scored, or None.
        board_keys: board keys whose jobs are scored; required when
            *source_keys* is empty.

    Returns:
        ``(item, scored_items, scores)`` where ``scores`` holds the second
        element of every prediction, or None when the API does not answer
        with code 200 (the response is printed).

    Raises:
        ValueError: if neither *source_keys* nor *board_keys* is provided.
    """
    scoring_profiles = bool(source_keys)
    if scoring_profiles:
        response_scoring = client.profile.scoring.list(
            job_key=item["key"],
            board_key=BOARD_KEY,
            source_keys=source_keys,
            limit=LIMIT_SCORING,
            agent_key=ALGORITHM,
        )
    else:
        if board_keys is None:
            raise ValueError("either source_keys or board_keys must be provided")
        response_scoring = client.job.scoring.list(
            profile_key=item["key"],
            source_key=SOURCE_KEY,
            board_keys=board_keys,
            limit=LIMIT_SCORING,
            agent_key=ALGORITHM,
        )

    if response_scoring["code"] != 200:
        print("error while returning scoring:", response_scoring)
        return None

    data = response_scoring["data"]
    scored_items = data["profiles"] if scoring_profiles else data["jobs"]
    # Each prediction is indexable; index 1 is what the report uses as score.
    # NOTE(review): presumably the probability of a match - confirm with API docs.
    scores = [prediction[1] for prediction in data["predictions"]]
    return item, scored_items, scores


def get_items_searching(client, source_keys=None, board_keys=None):
    """List the profiles of *source_keys* or the jobs of *board_keys*.

    Args:
        client: HrFlow client exposing ``profile.searching`` and
            ``job.searching``.
        source_keys: source keys to list profiles from; takes precedence
            when truthy.
        board_keys: board keys to list jobs from; required when
            *source_keys* is empty.

    Returns:
        The list of profiles or jobs of the response, or None when the API
        does not answer with code 200 (the response is printed).

    Raises:
        ValueError: if neither *source_keys* nor *board_keys* is provided.
    """
    searching_profiles = bool(source_keys)
    if searching_profiles:
        response_searching = client.profile.searching.list(
            source_keys=source_keys,
            limit=LIMIT_SEARCHING,
            order_by="desc",
        )
    else:
        if board_keys is None:
            raise ValueError("either source_keys or board_keys must be provided")
        response_searching = client.job.searching.list(
            board_keys=board_keys,
            limit=LIMIT_SEARCHING,
            order_by="desc",
        )

    if response_searching["code"] != 200:
        print("error while returning searching:", response_searching)
        return None

    data = response_searching["data"]
    return data["profiles"] if searching_profiles else data["jobs"]


def tagger_romev4(text, algorithm, timeout=30):
    """Return the top tag predicted for *text* by the given tagging algorithm.

    Posts *text* to the HrFlow text-tagging endpoint with ``top_n`` set to 1.

    Args:
        text: the text to tag.
        algorithm: value sent as ``algorithm_key``.
        timeout: seconds to wait for the HTTP call, so that one stalled
            request cannot hang the whole tagging loop.

    Returns:
        The first tag of the first result, or None when the request fails,
        the status is not 200, or the response carries no tag.
    """
    url = "https://api.hrflow.ai/v1/text/tagging"
    payload = {
        "algorithm_key": algorithm,
        "texts": [text],
        "top_n": 1,
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "X-API-KEY": API_SECRET,
        "X-USER-EMAIL": API_USER,
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Best effort, like the HTTP-status branch below: report and move on
        # so that a transient network failure does not abort a long run.
        print(f"HTTP error: {error}")
        return None
    if response.status_code != 200:
        print(f"HTTP error: {response.text}")
        return None

    data = response.json().get("data")
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        return None
    tags = data[0].get("tags")
    if isinstance(tags, list) and tags:
        return tags[0]
    return None


def format_date(date_str: Optional[str]) -> Optional[str]:
    """Convert a ``YYYY-MM-DDTHH:MM:SS<utc offset>`` timestamp to ``YYYY-MM-DD``.

    The offset is parsed with ``%z``, so ``+0000`` as well as other offsets
    such as ``+01:00`` are accepted. Returns None for an empty or None input.

    Raises:
        ValueError: if *date_str* does not match the expected layout.
    """
    if not date_str:
        return None
    parsed = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S%z")
    return parsed.strftime("%Y-%m-%d")


def categorize_scores(scores):
    """Count how many scores fall into each of the five star buckets.

    Buckets are 0.2 wide: [0, 0.2) is "1 star" up to [0.8, 1] for "5 stars".
    Scores outside [0, 1] are not counted.

    Returns:
        dict mapping "5 stars", "4 stars", "3 stars", "2 stars" and "1 star"
        to their counts.
    """
    star_count = {label: 0 for label in STAR_LABELS.values()}
    for score in scores:
        if 0.8 <= score <= 1:
            star_count["5 stars"] += 1
        elif 0.6 <= score < 0.8:
            star_count["4 stars"] += 1
        elif 0.4 <= score < 0.6:
            star_count["3 stars"] += 1
        elif 0.2 <= score < 0.4:
            star_count["2 stars"] += 1
        elif 0 <= score < 0.2:
            star_count["1 star"] += 1
    return star_count


def _sections_text(job):
    """Join the section descriptions of *job*, one per line (None -> empty)."""
    sections = job.get("sections") or []
    return "\n".join(section.get("description") or "" for section in sections)


def _location_text(item):
    """Return the ``text`` of the item's location, or None when absent."""
    return (item.get("location") or {}).get("text")


def _add_romev4_tags(item, text):
    """Run every ROME v4 tagger on *text* and store the results in item["tags"].

    Only non-empty predictions are stored. A tag that already carries the same
    name is replaced, so re-running the tagging cell does not pile up
    duplicates; tags for which no fresh value was predicted are left alone.
    """
    if not text:
        return  # nothing to tag
    fresh = {}
    for key, algorithm in TAGGER_ALGORITHMS.items():
        value = tagger_romev4(text, algorithm)
        if value:
            fresh[f"{TAG_PREFIX}{key}"] = value
    kept = [tag for tag in item.get("tags") or [] if tag["name"] not in fresh]
    item["tags"] = kept + [
        {"name": name, "value": value} for name, value in fresh.items()
    ]


def _tagger_columns(item):
    """Build the four "Tagger romev4 ..." report columns from item["tags"].

    When several tags share a name, the last one wins.
    """
    key_by_name = {f"{TAG_PREFIX}{key}": key for key in TAGGER_ALGORITHMS}
    values = dict.fromkeys(TAGGER_ALGORITHMS)
    for tag in item.get("tags") or []:
        key = key_by_name.get(tag["name"])
        if key is not None:
            values[key] = tag["value"]
    return {
        "Tagger romev4 family": values["family"],
        "Tagger romev4 subfamily": values["subfamily"],
        "Tagger romev4 category": values["category"],
        "Tagger romev4 job title": values["job_title"],
    }


def _star_columns(scores, scored_kind):
    """Build the five "Nombre de <scored_kind> ayant N étoiles" columns."""
    star_count = categorize_scores(scores)
    return {
        f"Nombre de {scored_kind} ayant {stars} étoiles": star_count[label]
        for stars, label in STAR_LABELS.items()
    }


# %% [markdown]
# ## Score profiles for a specific job

# %%
# A failed search returns None; fall back to an empty list so the loops below
# simply do nothing instead of raising on iteration.
jobs = get_items_searching(client, board_keys=BOARD_KEYS) or []
for job in tqdm(jobs):
    _add_romev4_tags(job, _sections_text(job))

# %%
scoring_job_result = []
for job in tqdm(jobs):
    scoring_result = get_scoring_items(client, job, source_keys=SOURCE_KEYS)
    # Jobs whose scoring call failed (already reported) are left out of the
    # report; unpacking a None result would otherwise abort the whole export.
    if scoring_result is not None:
        scoring_job_result.append(scoring_result)

rows = [
    {
        "Nom": job["name"],
        "Reference": job["reference"],
        "Date de création": format_date(job["created_at"]),
        "Localisation": _location_text(job),
        "Nombre de caractères de l'offre": len(_sections_text(job)),
        **_star_columns(scores, "profils"),
        **_tagger_columns(job),
    }
    for job, _, scores in scoring_job_result
]

df = pd.DataFrame(rows)

# NOTE(review): the profile section below writes to the same OUTPUT_FILE and
# will overwrite this report if both sections are run - confirm this is intended.
df.to_excel(OUTPUT_FILE, index=False)

print(f"Excel file '{OUTPUT_FILE}' generated successfully.")

# %% [markdown]
# ## Score jobs for a specific profile

# %%
profiles = get_items_searching(client, source_keys=SOURCE_KEYS) or []
for profile in tqdm(profiles):
    _add_romev4_tags(profile, profile.get("text"))

# %%
scoring_profile_result = []
for profile in tqdm(profiles):
    scoring_result = get_scoring_items(client, profile, board_keys=BOARD_KEYS)
    if scoring_result is not None:
        scoring_profile_result.append(scoring_result)

# NOTE(review): assumes searched profiles expose "name" at top level - verify
# against the API response.
rows = [
    {
        "Nom": profile["name"],
        "Reference": profile["reference"],
        "Date de reception": format_date(profile["created_at"]),
        "Localisation": _location_text(profile),
        # Length of the profile's own text. This column previously reused the
        # job-description length left over from the job section (and its
        # "de l'offre" label), which was stale or undefined here.
        "Nombre de caractères du profil": len(profile.get("text") or ""),
        **_star_columns(scores, "jobs"),
        **_tagger_columns(profile),
    }
    for profile, _, scores in scoring_profile_result
]

df = pd.DataFrame(rows)

df.to_excel(OUTPUT_FILE, index=False)

print(f"Excel file '{OUTPUT_FILE}' generated successfully.")