diff --git a/UTILS/AO2DQuery/AO2Dquery_utils.py b/UTILS/AO2DQuery/AO2Dquery_utils.py
new file mode 100644
index 000000000..f48c911c0
--- /dev/null
+++ b/UTILS/AO2DQuery/AO2Dquery_utils.py
@@ -0,0 +1,50 @@
+import sys
+import ROOT
+
+"""
+Merge the DF* directories of several AO2D ROOT files into a single output file.
+
+Usage:
+python $O2DPG/UTILS/AO2DQuery/AO2Dquery_utils.py AO2D_Derived_Merged.root $(find \
+    /lustre/alice/users/rverma/NOTESData/alice-tpc-notes/Downsampled -iname AO2D_Derived.root | head -n 10)
+"""
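+
+# Output naming scheme (directory id below is illustrative): each input file i
+# contributes its DF* directories renamed to <name>__<i>, e.g.
+# DF_2261906078621696__0, DF_2261906078621696__1, ..., so directory names stay
+# unique after merging.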
+
+def merge_root_directories_with_suffix(output_file, input_files):
+    """Copy every DF* directory from each input file into *output_file*,
+    appending "__<input index>" to the directory name."""
+    fout = ROOT.TFile(output_file, "RECREATE")
+
+    for i, fname in enumerate(input_files):
+        fin = ROOT.TFile.Open(fname)
+        if not fin or fin.IsZombie():
+            print(f"Warning: Could not open {fname}")
+            continue
+
+        for key in fin.GetListOfKeys():
+            dname = key.GetName()
+            if not dname.startswith("DF"):
+                continue
+
+            src_dir = fin.Get(dname)
+            new_dname = f"{dname}__{i}"  # add per-input suffix
+
+            fout.cd()
+            fout.mkdir(new_dname)
+            fout.cd(new_dname)
+
+            for subkey in src_dir.GetListOfKeys():
+                obj_name = subkey.GetName()
+                obj = src_dir.Get(obj_name)
+
+                # Clone trees properly; CloneTree(-1) deep-copies all entries
+                if obj.InheritsFrom("TTree"):
+                    cloned = obj.CloneTree(-1)
+                    cloned.SetName(obj_name)
+                    cloned.Write()
+                else:
+                    obj.Write()
+
+        fin.Close()
+    fout.Close()
+
+
+if __name__ == "__main__":
+    output = sys.argv[1]
+    inputs = sys.argv[2:]
+    merge_root_directories_with_suffix(output, inputs)
diff --git a/UTILS/TimeSeries/timeseries_diff.py b/UTILS/TimeSeries/timeseries_diff.py
new file mode 100644
index 000000000..803c510fd
--- /dev/null
+++ b/UTILS/TimeSeries/timeseries_diff.py
@@ -0,0 +1,164 @@
+"""timeseries_diff.py
+import sys,os; sys.path.insert(1, os.environ["O2DPG"]+"/UTILS/TimeSeries");
+from timeseries_diff import *
+
+Utility helpers for time-series comparison scripts. The reader below keeps
+the ROOT files open so the returned TTrees stay valid.
+"""
+
+import os
+import pathlib
+from typing import List, Tuple, Optional
+
+import ROOT  # PyROOT
+
+# ---------------------------------------------------------------------------
+# Helper: open many ROOT files and keep them alive
+# ---------------------------------------------------------------------------
+
+def read_time_series(listfile: str = "o2_timeseries_tpc.list",
+                     treename: str = "timeSeries") -> List[Tuple[ROOT.TFile, Optional[ROOT.TTree]]]:
+    """Read *listfile* containing one ROOT path per line and return a list
+    of ``(TFile, TTree | None)`` tuples.
+
+    The TFile objects are **kept open** (and returned) so the TTrees remain
+    valid for the caller. Blank lines and lines starting with "#" are
+    ignored. Environment variables in paths are expanded.
+
+    Parameters
+    ----------
+    listfile : str
+        Text file with ROOT filenames.
+    treename : str, default "timeSeries"
+        Name of the tree to retrieve from each file.
+
+    Returns
+    -------
+    list of tuples
+        ``[(f1, tree1), (f2, tree2), ...]`` where *tree* is ``None`` if
+        the file or tree could not be opened.
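+
+    Example
+    -------
+    A minimal sketch of the intended call pattern (the list file name is the
+    default shown above; purely illustrative):
+
+    >>> pairs = read_time_series("o2_timeseries_tpc.list")
+    >>> trees = [t for _, t in pairs if t is not None]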
+ """ + files_and_trees: List[Tuple[ROOT.TFile, Optional[ROOT.TTree]]] = [] + + with open(listfile, "r") as fh: + paths = [ln.strip() for ln in fh if ln.strip() and not ln.startswith("#")] + + for raw_path in paths: + path = os.path.expandvars(raw_path) + if not pathlib.Path(path).is_file(): + print(f"[read_time_series] warning: file not found -> {path}") + files_and_trees.append((None, None)) + continue + try: + froot = ROOT.TFile.Open(path) + if not froot or froot.IsZombie(): + raise RuntimeError("file could not be opened") + tree = froot.Get(treename) + if not tree: + print(f"[read_time_series] warning: tree '{treename}' missing in {path}") + files_and_trees.append((froot, tree)) + except Exception as e: + print(f"[read_time_series] error: cannot open {path}: {e}") + files_and_trees.append((None, None)) + + return files_and_trees + +def makeAliases(trees): + for tree in trees: tree[1].AddFriend(trees[0][1],"F") + + +def setStyle(): + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetOptTitle(0) + ROOT.gStyle.SetPalette(ROOT.kRainBow) + ROOT.gStyle.SetPaintTextFormat(".2f") + ROOT.gStyle.SetTextFont(42) + ROOT.gStyle.SetTextSize(0.04) + ROOT.gROOT.ForceStyle() + ROOT.gROOT.SetBatch(True) + + + + + + +# --------------------------------------------------------------------------- +# make_ratios ---------------------------------------------------------------- +# --------------------------------------------------------------------------- + +def make_ratios(trees: list, outdir: str = "fig", pdf_name: str = "ratios.pdf") -> ROOT.TCanvas: + """Create ratio plots *log(var/F.var) vs Iteration$* for each input tree. + * A PNG for every variable / tree is saved to *outdir* + * All canvases are also appended to a multi‑page PDF *pdf_name* + * Vertical guide‑lines mark the logical regions (isector, itgl, iqpt, occu) + + """ + outdir = pathlib.Path(outdir) + outdir.mkdir(parents=True, exist_ok=True) + pdf_path = outdir / pdf_name + + # ------- style / helpers ---------------------------------------------- + ROOT.gStyle.SetOptTitle(1) + canvas = ROOT.TCanvas("c_ratio", "ratio plots", 1200, 600) + lab = ROOT.TLatex() + lab.SetTextSize(0.04) + + # vertical guides in **user** x‑coordinates (Iteration$ axis: 0–128) + vlines = [0, 54, 84, 104, 127] + vnames = ["isector", "itgl", "iqpt", "occupancy"] + vcolors = [ROOT.kRed+1, ROOT.kBlue+1, ROOT.kGreen+2, ROOT.kMagenta+1] + setups=["ref","apass2_closure-test-zAcc.GausSmooth_test3_streamer","apass2_closure-test-zAcc.GausSmooth_test4_streamer","apass2_closure-test-zAcc.GausSmooth_test2_streamer"] + # variables to compare --------------------------------------------------- + vars_ = [ + "mTSITSTPC.mTPCChi2A", "mTSITSTPC.mTPCChi2C", + "mTSTPC.mDCAr_A_NTracks", "mTSTPC.mDCAr_C_NTracks", + "mTSTPC.mTPCNClA", "mTSTPC.mTPCNClC", + "mITSTPCAll.mITSTPC_A_MatchEff", "mITSTPCAll.mITSTPC_C_MatchEff", + "mdEdxQMax.mLogdEdx_A_RMS","mdEdxQMax.mLogdEdx_C_RMS", + "mdEdxQMax.mLogdEdx_A_IROC_RMS","mdEdxQMax.mLogdEdx_C_IROC_RMS" + ] + cut = "mTSITSTPC.mDCAr_A_NTracks > 200" + + # open PDF --------------------------------------------------------------- + canvas.Print(f"{pdf_path}[") # begin multipage + + for setup_index, (_, tree) in enumerate(trees[1:], start=1): + if not tree: + continue + for var in vars_: + expr = f"log({var}/F.{var}):Iteration$" + # 2‑D density histogram + tree.Draw(f"{expr}>>his(128,0,128,50,-0.05,0.05)", cut, "colz") + # profile overlay + tree.Draw(f"{expr}>>hp(128,0,128)", cut, "profsame") + pad = ROOT.gPad + ymin, ymax = -0.05, 0.05 + # keep references so ROOT 
+            # keep references so ROOT does not garbage-collect the guides
+            guides: list[ROOT.TLine] = []
+            for x, txt, col in zip(vlines, vnames, vcolors):
+                # skip lines outside current x-range (safety when reusing canvas)
+                if x < 0 or x > 128:
+                    continue
+                # 1) vertical line in **user** coordinates
+                ln = ROOT.TLine(x, ymin, x, ymax)
+                ln.SetLineColor(col)
+                ln.SetLineStyle(2)
+                ln.SetLineWidth(5)
+                ln.Draw()
+                guides.append(ln)
+                # 2) region label, drawn in user coordinates next to the line
+                #    (y=0.03 lies inside the fixed y-range of the histogram)
+                lab.SetTextColor(col)
+                lab.DrawLatex(x + 0.02, 0.03, txt)
+
+            # label of the setup on top-left
+            lab.SetTextColor(ROOT.kMagenta+2)
+            lab.DrawLatex(0.15, 0.05, f"Setup {setups[setup_index]}")
+            canvas.Modified(); canvas.Update()
+
+            # ----------------------------------------------------------------
+            tag = var.split('.')[-1]
+            canvas.SaveAs(str(outdir / f"ratio_{setup_index}_{tag}.png"))
+            canvas.Print(str(pdf_path))  # add page
+
+            # prevent ROOT from deleting the guides before next Draw()
+            for ln in guides:
+                pad.GetListOfPrimitives().Remove(ln)
+
+    canvas.Print(f"{pdf_path}]")  # close multipage
+    return canvas
diff --git a/UTILS/dfextensions/AliasDataFrame.md b/UTILS/dfextensions/AliasDataFrame.md
new file mode 100644
index 000000000..bf0804941
--- /dev/null
+++ b/UTILS/dfextensions/AliasDataFrame.md
@@ -0,0 +1,200 @@
+# AliasDataFrame – Hierarchical Lazy Evaluation for Pandas + ROOT
+
+`AliasDataFrame` is an extension of `pandas.DataFrame` that enables **named expression-based columns (aliases)** with:
+
+* ✅ **Lazy evaluation** (on-demand computation)
+* ✅ **Automatic dependency resolution** (topological sort, cycle detection)
+* ✅ **Hierarchical aliasing** across **linked subframes** (e.g. clusters referencing tracks via index joins)
+* ✅ **Persistence** to Parquet and ROOT TTree formats, including full alias metadata
+
+It is designed for physics and data analysis workflows where derived quantities, calibration constants, and multi-table joins should remain symbolic until final export.
+
+---
+
+## ✨ Core Features
+
+### ✅ Alias Definition & Lazy Evaluation
+
+Define symbolic columns as expressions involving other columns or aliases:
+
+```python
+adf.add_alias("pt", "sqrt(px**2 + py**2)")
+adf.materialize_alias("pt")
+```
+
+### ✅ Subframe Support (Hierarchical Dependencies)
+
+Reference a subframe (e.g. per-cluster frame linked to a per-track frame):
+
+```python
+adf_clusters.register_subframe("track", adf_tracks, index_columns="track_index")
+adf_tracks.register_subframe("collision", adf_collisions, index_columns="collision_index")
+
+adf_clusters.add_alias("dX", "mX - track.mX")
+adf_clusters.add_alias("vertexZ", "track.collision.z")
+```
+
+Under the hood, this performs joins using index columns such as `track_index` and `collision_index`, rewrites dotted expressions like `track.mX` and `track.collision.z` to joined columns, and evaluates in that context.
+
+For example, in ALICE data:
+
+- clusters reference tracks: `cluster → track`
+- tracks reference collisions: `track → collision`
+- V0s reference two tracks: `v0 → track1`, `v0 → track2`
+
+These relations can be declared using `register_subframe()` and used symbolically in aliases.
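+
+A sketch of the rewrite that the join layer performs (it follows
+`_prepare_subframe_joins`; the frames and columns are the ones from the example
+above):
+
+```python
+# For the alias "dX" = "mX - track.mX" with index_columns="track_index",
+# the referenced column is joined into the parent frame ...
+joined = adf_clusters.df.merge(
+    adf_tracks.df[["track_index", "mX"]],
+    on="track_index", suffixes=("", "__track"),
+)
+# ... and the expression is rewritten to use the joined column:
+# "mX - track.mX"  ->  "mX - mX__track"
+```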
+ +### ✅ Dependency Graph & Cycle Detection + +* Automatically resolves dependency order +* Detects and raises on circular alias definitions +* Visualize with: + +```python +adf.plot_alias_dependencies() +``` + +### ✅ Constant Aliases & Dtype Enforcement + +```python +adf.add_alias("scale", "1.5", dtype=np.float32, is_constant=True) +``` + +### ✅ Attribute Access for Aliases and Subframes + +Access aliases and subframe members with convenient dot notation: + +```python +adf.cutHighPt # equivalent to adf["cutHighPt"] +adf.track.pt # evaluates pt from registered subframe "track" +``` + +--- + +## 💾 Persistence + +### ➤ Save to Parquet + +```python +adf.save("data/my_frame") # Saves data + alias metadata +``` + +### ➤ Load from Parquet + +```python +adf2 = AliasDataFrame.load("data/my_frame") +``` + +### ➤ Export to ROOT TTree (with aliases!) + +```python +adf.export_tree("output.root", treename="MyTree") +``` + +### ➤ Import from ROOT TTree + +```python +adf = AliasDataFrame.read_tree("output.root", treename="MyTree") +``` + +Subframe alias metadata (including join indices) is preserved recursively. + +--- + +## 🧪 Unit-Tested Features + +Tests included for: + +* Basic alias chaining and materialization +* Dtype conversion +* Constant and hierarchical aliasing +* Partial materialization +* Subframe joins on index columns +* Chained access via `adf.attr` and `adf.subframe.alias` +* Persistence round-trips for `.parquet` and `.root` +* Error detection: cycles, invalid expressions, undefined symbols + +--- + +## 🧠 Internals + +* Expression evaluation via `eval()` with math/Numpy-safe scope +* Dependency analysis via `networkx` +* Subframes stored in a registry (`SubframeRegistry`) with index-aware entries +* Subframe alias resolution is performed via on-the-fly joins using provided index columns +* Metadata embedded into: + + * `.parquet` via Arrow schema metadata + * `.root` via `TTree::SetAlias` and `TObjString` + +--- + +## 🔍 Introspection & Debugging + +```python +adf.describe_aliases() # Print aliases, dependencies, broken ones +adf.validate_aliases() # List broken/inconsistent aliases +``` + +--- + +## 🧩 Requirements + +* `pandas`, `numpy`, `pyarrow`, `uproot`, `networkx`, `matplotlib`, `ROOT` + +--- + +## 🔁 Comparison with Other Tools + +| Feature | AliasDataFrame | pandas | Vaex | Awkward Arrays | polars | Dask | +| ----------------------------- | -------------- | --------- | -------- | -------------- | --------- | --------- | +| Lazy alias columns | ✅ Yes | ⚠️ Manual | ✅ Yes | ❌ | ✅ Partial | ✅ Partial | +| Dependency tracking | ✅ Full graph | ❌ | ⚠️ Basic | ❌ | ❌ | ❌ | +| Subframe hierarchy (joins) | ✅ Index-based | ❌ | ❌ | ⚠️ Nested only | ❌ | ⚠️ Manual | +| Constant alias support | ✅ With dtype | ❌ | ❌ | ❌ | ❌ | ❌ | +| Visualization of dependencies | ✅ `networkx` | ❌ | ❌ | ❌ | ❌ | ❌ | +| Export to ROOT TTree | ✅ Optional | ❌ | ❌ | ✅ via uproot | ❌ | ❌ | + +--- + +## ❓ Why AliasDataFrame? + +In many data workflows, users recreate the same patterns again and again: + +* Manually compute derived columns with ad hoc logic +* Scatter constants and correction factors in multiple files +* Perform fragile joins between tables (e.g. 
clusters ↔ tracks) with little traceability
+* Lose transparency into what each column actually means
+
+**AliasDataFrame** turns these practices into a formalized, symbolic layer over your DataFrames:
+
+* 📐 Define all derived quantities as symbolic expressions
+* 🔗 Keep relations between DataFrames declarative, index-based, and reusable
+* 📊 Visualize dependency structures and broken logic automatically
+* 📦 Export the full state of your workflow (including symbolic metadata)
+
+It brings the clarity of a computation graph to structured table analysis — a common but under-supported need in `pandas`, `vaex`, or `polars` workflows.
+
+---
+
+## 🛣 Roadmap Ideas
+
+* [ ] Secure expression parser (no raw `eval`)
+* [ ] Aliased column caching / invalidation strategy
+* [ ] Inter-subframe join strategies (e.g., key-based, 1:n)
+* [ ] Jupyter widget or CLI tool for alias graph exploration
+* [ ] Broadcasting-aware joins or 2D index support
+
+---
+
+## 🧑‍🔬 Designed for...
+
+* Physics workflows (e.g. ALICE Physics analysis V0 ↔ tracks ↔ collisions)
+* Symbolic calibration / correction workflows
+* Structured data exports with traceable metadata
+
+---
+
+**Author:** Marian Ivanov
+
+MIT License
diff --git a/UTILS/dfextensions/AliasDataFrame.py b/UTILS/dfextensions/AliasDataFrame.py
new file mode 100644
index 000000000..1ba9f8263
--- /dev/null
+++ b/UTILS/dfextensions/AliasDataFrame.py
@@ -0,0 +1,454 @@
+import sys, os; sys.path.insert(1, os.environ.get("O2DPG", "") + "/UTILS/dfextensions")
+import pandas as pd
+import numpy as np
+import json
+import uproot
+import ROOT  # type: ignore
+import matplotlib.pyplot as plt
+import networkx as nx
+import re
+import ast
+
+class SubframeRegistry:
+    """
+    Registry to manage subframes (nested AliasDataFrame instances).
+    """
+    def __init__(self):
+        self.subframes = {}  # name → {'frame': adf, 'index': index_columns}
+
+    def add_subframe(self, name, alias_df, index_columns, pre_index=False):
+        cols = [index_columns] if isinstance(index_columns, str) else list(index_columns)
+        if pre_index and list(alias_df.df.index.names) != cols:
+            alias_df.df.set_index(cols, inplace=True)
+        self.subframes[name] = {'frame': alias_df, 'index': index_columns}
+
+    def get(self, name):
+        return self.subframes.get(name, {}).get('frame', None)
+
+    def get_entry(self, name):
+        return self.subframes.get(name, None)
+
+    def items(self):
+        return self.subframes.items()
+
+
+def convert_expr_to_root(expr):
+    """Translate a NumPy-flavoured expression into a TTree::SetAlias-compatible
+    string (e.g. "np.arctan2(y, x)" -> "atan2(y, x)"). The expression is
+    returned unchanged if it cannot be parsed."""
+    class RootTransformer(ast.NodeTransformer):
+        FUNC_MAP = {
+            "arctan2": "atan2",
+            "mod": "fmod",
+            "sqrt": "sqrt",
+            "log": "log",
+            "log10": "log10",
+            "exp": "exp",
+            "abs": "abs",
+            "power": "pow",
+            "maximum": "TMath::Max",
+            "minimum": "TMath::Min"
+        }
+
+        def visit_Call(self, node):
+            def get_func_name(n):
+                if isinstance(n, ast.Attribute):
+                    return n.attr
+                elif isinstance(n, ast.Name):
+                    return n.id
+                return ""
+
+            func_name = get_func_name(node.func)
+            root_func = self.FUNC_MAP.get(func_name, func_name)
+
+            node.args = [self.visit(arg) for arg in node.args]
+            node.func = ast.Name(id=root_func, ctx=ast.Load())
+            return node
+
+    try:
+        expr_clean = re.sub(r"\bnp\.", "", expr)  # strip the "np." prefix
+        tree = ast.parse(expr_clean, mode='eval')
+        tree = RootTransformer().visit(tree)
+        ast.fix_missing_locations(tree)
+        return ast.unparse(tree)
+    except Exception:
+        return expr
+
+class AliasDataFrame:
+    """
+    AliasDataFrame allows defining and evaluating lazily evaluated column aliases
+    on top of a pandas DataFrame, including nested subframes with hierarchical indexing.
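+
+    Example (mirrors the unit tests; a minimal sketch):
+        >>> adf = AliasDataFrame(pd.DataFrame({"x": np.arange(3), "y": np.arange(3)}))
+        >>> adf.add_alias("z", "x + y")
+        >>> adf.z  # the alias is materialized on first attribute access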
+ """ + def __init__(self, df): + if not isinstance(df, pd.DataFrame): + raise TypeError( + f"AliasDataFrame must be initialized with a pandas.DataFrame. " + f"Received type: {type(df)}" + ) + self.df = df + self.aliases = {} + self.alias_dtypes = {} + self.constant_aliases = set() + self._subframes = SubframeRegistry() + + def __getattr__(self, item: str): + if item in self.df.columns: + return self.df[item] + if item in self.aliases: + self.materialize_alias(item) + return self.df[item] + sf = self._subframes.get(item) + if sf is not None: + return sf + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{item}'") + + + def register_subframe(self, name, adf, index_columns, pre_index=False): + self._subframes.add_subframe(name, adf, index_columns, pre_index=pre_index) + + def get_subframe(self, name): + return self._subframes.get(name) + + def _default_functions(self): + import math + env = {k: getattr(math, k) for k in dir(math) if not k.startswith("_")} + env.update({k: getattr(np, k) for k in dir(np) if not k.startswith("_")}) + env["np"] = np + for sf_name, sf_entry in self._subframes.items(): + env[sf_name] = sf_entry['frame'] + # Custom compatibility for SetAlias-like expressions + env["int"] = lambda x: np.array(x).astype(np.int32) + env["uint"] = lambda x: np.array(x).astype(np.uint32) + env["float"] = lambda x: np.array(x).astype(np.float32) + env["round"] = np.round + env["clip"] = np.clip + return env + + def _prepare_subframe_joins(self, expr): + tokens = re.findall(r'(\b\w+)\.(\w+)', expr) + for sf_name, sf_col in tokens: + entry = self._subframes.get_entry(sf_name) + if not entry: + continue + sub_adf = entry['frame'] + sub_df = sub_adf.df + index_cols = entry['index'] + if isinstance(index_cols, str): + index_cols = [index_cols] + merge_cols = index_cols + [sf_col] + suffix = f'__{sf_name}' + + try: + cols_to_merge = sub_df[merge_cols] + except KeyError: + if sf_col in sub_adf.aliases: + sub_adf.materialize_alias(sf_col) + sub_df = sub_adf.df + cols_to_merge = sub_df[merge_cols] + else: + raise KeyError(f"Subframe '{sf_name}' does not contain or define alias '{sf_col}'") + + joined = self.df.merge(cols_to_merge, on=index_cols, suffixes=('', suffix)) + col_renamed = f'{sf_col}{suffix}' + if col_renamed in joined.columns: + self.df[col_renamed] = joined[col_renamed].values + expr = expr.replace(f'{sf_name}.{sf_col}', col_renamed) + return expr + + def _check_for_cycles(self): + try: + self._topological_sort() + except ValueError as e: + raise ValueError("Cycle detected in alias dependencies") from e + + def add_alias(self, name, expression, dtype=None, is_constant=False): + """ + Define a new alias. + Args: + name: Name of the alias. + expression: Expression string using pandas or NumPy operations. + dtype: Optional numpy dtype to enforce. + is_constant: Whether the alias represents a scalar constant. 
+ """ + self.aliases[name] = expression + if dtype is not None: + self.alias_dtypes[name] = dtype + if is_constant: + self.constant_aliases.add(name) + self._check_for_cycles() + + def _eval_in_namespace(self, expr): + expr = self._prepare_subframe_joins(expr) + local_env = {col: self.df[col] for col in self.df.columns} + local_env.update(self._default_functions()) + return eval(expr, {}, local_env) + + def _resolve_dependencies(self): + from collections import defaultdict + dependencies = defaultdict(set) + for name, expr in self.aliases.items(): + tokens = re.findall(r'\b\w+\b', expr) + for token in tokens: + if token in self.aliases: + dependencies[name].add(token) + return dependencies + + def _check_for_cycles(self): + graph = nx.DiGraph() + for name, deps in self._resolve_dependencies().items(): + for dep in deps: + graph.add_edge(dep, name) + try: + list(nx.topological_sort(graph)) + except nx.NetworkXUnfeasible: + raise ValueError("Cycle detected in alias dependencies") + + def plot_alias_dependencies(self): + deps = self._resolve_dependencies() + G = nx.DiGraph() + for alias, subdeps in deps.items(): + for dep in subdeps: + G.add_edge(dep, alias) + pos = nx.spring_layout(G) + plt.figure(figsize=(10, 6)) + nx.draw(G, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=2000, font_size=10, arrows=True) + plt.title("Alias Dependency Graph") + plt.show() + + def _topological_sort(self): + from collections import defaultdict, deque + self._check_for_cycles() + dependencies = self._resolve_dependencies() + reverse_deps = defaultdict(set) + indegree = defaultdict(int) + for alias, deps in dependencies.items(): + indegree[alias] = len(deps) + for dep in deps: + reverse_deps[dep].add(alias) + queue = deque([alias for alias in self.aliases if indegree[alias] == 0]) + result = [] + while queue: + node = queue.popleft() + result.append(node) + for dependent in reverse_deps[node]: + indegree[dependent] -= 1 + if indegree[dependent] == 0: + queue.append(dependent) + if len(result) != len(self.aliases): + raise ValueError("Cycle detected in alias dependencies") + return result + + def validate_aliases(self): + broken = [] + for name, expr in self.aliases.items(): + try: + self._eval_in_namespace(expr) + except Exception: + broken.append(name) + return broken + + def describe_aliases(self): + print("Aliases:") + for name, expr in self.aliases.items(): + print(f" {name}: {expr}") + broken = self.validate_aliases() + if broken: + print("\nBroken Aliases:") + for name in broken: + print(f" {name}") + print("\nDependencies:") + deps = self._resolve_dependencies() + for k, v in deps.items(): + print(f" {k}: {sorted(v)}") + + def materialize_alias(self, name, cleanTemporary=False, dtype=None): + """ + Evaluate an alias and store its result as a real column. + Args: + name: Alias name to materialize. + cleanTemporary: Whether to clean up intermediate dependencies. + dtype: Optional override dtype to cast to. + + Raises: + KeyError: If alias is not defined. + Exception: If alias evaluation fails. + """ + if name not in self.aliases: + print(f"[materialize_alias] Warning: alias '{name}' not found.") + return + expr = self.aliases[name] + + # Automatically materialize any referenced aliases or subframe aliases + tokens = re.findall(r'\b\w+\b|\w+\.\w+', expr) + for token in tokens: + if '.' 
+                sf_name, sf_attr = token.split('.', 1)
+                sf = self.get_subframe(sf_name)
+                if sf and sf_attr in sf.aliases and sf_attr not in sf.df.columns:
+                    sf.materialize_alias(sf_attr)
+            elif token in self.aliases and token not in self.df.columns:
+                self.materialize_alias(token)
+
+        result = self._eval_in_namespace(expr)
+        result_dtype = dtype or self.alias_dtypes.get(name)
+        if result_dtype is not None:
+            try:
+                result = result.astype(result_dtype)
+            except AttributeError:
+                result = result_dtype(result)
+        self.df[name] = result
+
+    def materialize_aliases(self, targets, cleanTemporary=True, verbose=False):
+        import networkx as nx
+
+        def build_graph():
+            g = nx.DiGraph()
+            for alias, expr in self.aliases.items():
+                g.add_node(alias)  # keep dependency-free aliases in the graph
+                for token in re.findall(r'\b\w+\b', expr):
+                    if token in self.aliases:
+                        g.add_edge(token, alias)
+            return g
+
+        g = build_graph()
+        required = set()
+        for t in targets:
+            if t not in self.aliases:
+                if verbose:
+                    print(f"[materialize_aliases] Skipping non-alias target: {t}")
+                continue
+            if t not in g:
+                if verbose:
+                    print(f"[materialize_aliases] Alias '{t}' not in graph")
+                continue
+            try:
+                required |= nx.ancestors(g, t)
+            except nx.NetworkXError:
+                continue
+            required.add(t)
+        ordered = list(nx.topological_sort(g.subgraph(required)))
+        added = []
+        for name in ordered:
+            if name not in self.df.columns:
+                self.materialize_alias(name)
+                added.append(name)
+        if cleanTemporary:
+            for col in added:
+                if col not in targets and col in self.df.columns:
+                    self.df.drop(columns=[col], inplace=True)
+        return added
+
+    def materialize_all(self):
+        self._check_for_cycles()
+        for name in self.aliases:
+            self.materialize_alias(name)
+
+    def save(self, path_prefix, dropAliasColumns=True):
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+        if dropAliasColumns:
+            cols = [c for c in self.df.columns if c not in self.aliases]
+        else:
+            cols = list(self.df.columns)
+        table = pa.Table.from_pandas(self.df[cols])
+        metadata = {
+            "aliases": json.dumps(self.aliases),
+            "dtypes": json.dumps({k: v.__name__ for k, v in self.alias_dtypes.items()}),
+            "constants": json.dumps(list(self.constant_aliases))
+        }
+        existing_meta = table.schema.metadata or {}
+        combined_meta = existing_meta.copy()
+        combined_meta.update({k.encode(): v.encode() for k, v in metadata.items()})
+        table = table.replace_schema_metadata(combined_meta)
+        pq.write_table(table, f"{path_prefix}.parquet", compression="zstd")
+
+    @staticmethod
+    def load(path_prefix):
+        import pyarrow.parquet as pq
+        table = pq.read_table(f"{path_prefix}.parquet")
+        df = table.to_pandas()
+        adf = AliasDataFrame(df)
+        meta = table.schema.metadata or {}
+        if b"aliases" in meta and b"dtypes" in meta:
+            adf.aliases = json.loads(meta[b"aliases"].decode())
+            adf.alias_dtypes = {k: getattr(np, v) for k, v in json.loads(meta[b"dtypes"].decode()).items()}
+        if b"constants" in meta:
+            adf.constant_aliases = set(json.loads(meta[b"constants"].decode()))
+        return adf
+
+    def export_tree(self, filename_or_file, treename="tree", dropAliasColumns=True, compression=uproot.ZLIB(level=1)):
+        """Export this frame and all registered subframes to ROOT TTrees.
+
+        Args:
+            filename_or_file: Output path, or an already-open uproot file.
+            treename: Name of the (parent) tree.
+            dropAliasColumns: If True, materialized alias columns are not written.
+            compression: uproot compression, e.g. uproot.ZLIB(level=1) or uproot.LZMA(level=5).
+
+        Alias metadata (TTree::SetAlias plus a JSON TObjString in UserInfo) can
+        only be attached when a file path is given: it is written with PyROOT
+        after the uproot handle has been closed.
+        """
+        is_path = isinstance(filename_or_file, str)
+
+        if is_path:
+            with uproot.recreate(filename_or_file, compression=compression) as f:
+                self._write_to_uproot(f, treename, dropAliasColumns)
+            self._write_metadata_to_root(filename_or_file, treename)
+            for subframe_name, entry in self._subframes.items():
+                entry["frame"]._write_metadata_to_root(filename_or_file, f"{treename}__subframe__{subframe_name}")
+        else:
+            # Data only: attaching ROOT metadata needs a real file path.
+            self._write_to_uproot(filename_or_file, treename, dropAliasColumns)
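+    # Resulting file layout (illustrative), for a parent tree "tree" with one
+    # registered subframe "track":
+    #   tree                     <- parent columns
+    #   tree__subframe__track    <- subframe columns (+ its alias metadata)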
+    def _write_to_uproot(self, uproot_file, treename, dropAliasColumns):
+        export_cols = [col for col in self.df.columns if not dropAliasColumns or col not in self.aliases]
+        dtype_casts = {col: np.float32 for col in export_cols if self.df[col].dtype == np.float16}
+        export_df = self.df[export_cols].astype(dtype_casts)
+
+        uproot_file[treename] = export_df
+
+        # Recurse on the data only; subframe metadata is attached in export_tree.
+        for subframe_name, entry in self._subframes.items():
+            entry["frame"]._write_to_uproot(uproot_file, f"{treename}__subframe__{subframe_name}", dropAliasColumns)
+
+    def _write_metadata_to_root(self, filename, treename):
+        f = ROOT.TFile.Open(filename, "UPDATE")
+        tree = f.Get(treename)
+        for alias, expr in self.aliases.items():
+            try:
+                val = float(expr)
+                expr_str = f"({val}+0)"
+            except Exception:
+                expr_str = convert_expr_to_root(expr)
+            tree.SetAlias(alias, expr_str)
+        metadata = {
+            "aliases": self.aliases,
+            "subframe_indices": {k: v["index"] for k, v in self._subframes.items()},
+            "dtypes": {k: v.__name__ for k, v in self.alias_dtypes.items()},
+            "constants": list(self.constant_aliases),
+            "subframes": list(self._subframes.subframes.keys())
+        }
+        jmeta = json.dumps(metadata)
+        tree.GetUserInfo().Add(ROOT.TObjString(jmeta))
+        tree.Write("", ROOT.TObject.kOverwrite)
+        f.Close()
+
+    @staticmethod
+    def read_tree(filename, treename="tree"):
+        with uproot.open(filename) as f:
+            df = f[treename].arrays(library="pd")
+        adf = AliasDataFrame(df)
+        froot = ROOT.TFile.Open(filename)
+        try:
+            tree = froot.Get(treename)
+            alias_list = tree.GetListOfAliases()
+            if alias_list:
+                for alias in alias_list:
+                    adf.aliases[alias.GetName()] = alias.GetTitle()
+            user_info = tree.GetUserInfo()
+            for i in range(user_info.GetEntries()):
+                obj = user_info.At(i)
+                if not isinstance(obj, ROOT.TObjString):
+                    continue
+                try:
+                    jmeta = json.loads(obj.GetString().Data())
+                except json.JSONDecodeError:
+                    continue  # not our metadata payload; keep real errors visible
+                adf.aliases.update(jmeta.get("aliases", {}))
+                adf.alias_dtypes.update({k: getattr(np, v) for k, v in jmeta.get("dtypes", {}).items()})
+                adf.constant_aliases.update(jmeta.get("constants", []))
+                for sf_name in jmeta.get("subframes", []):
+                    sf = AliasDataFrame.read_tree(filename, treename=f"{treename}__subframe__{sf_name}")
+                    index = jmeta.get("subframe_indices", {}).get(sf_name)
+                    if index is None:
+                        raise ValueError(f"Missing index_columns for subframe '{sf_name}' in metadata")
+                    adf.register_subframe(sf_name, sf, index_columns=index)
+                break
+        finally:
+            froot.Close()
+        return adf
diff --git a/UTILS/dfextensions/AliasDataFrameTest.py b/UTILS/dfextensions/AliasDataFrameTest.py
new file mode 100644
index 000000000..08612fb79
--- /dev/null
+++ b/UTILS/dfextensions/AliasDataFrameTest.py
@@ -0,0 +1,223 @@
+import unittest
+import pandas as pd
+import numpy as np
+import os
+from AliasDataFrame import AliasDataFrame  # Adjust if needed
+import tempfile
+
+class TestAliasDataFrame(unittest.TestCase):
+    def setUp(self):
+        df = pd.DataFrame({
+            "x": np.arange(5),
+            "y": np.arange(5, 10),
+            "CTPLumi_countsFV0": np.array([2000, 2100, 2200, 2300, 2400])
+        })
+        self.adf = AliasDataFrame(df)
+
+    def test_basic_alias(self):
+        self.adf.add_alias("z", "x + y")
+        self.adf.materialize_all()
+        expected = self.adf.df["x"] + self.adf.df["y"]
+        pd.testing.assert_series_equal(self.adf.df["z"], expected, check_names=False)
+
+    def test_dtype(self):
+        self.adf.add_alias("z", "x + y", dtype=np.float16)
+        self.adf.materialize_all()
+        self.assertEqual(self.adf.df["z"].dtype, np.float16)
+
+    def test_constant(self):
self.adf.add_alias("c", "42.0", dtype=np.float32, is_constant=True) + self.adf.add_alias("z", "x + c") + self.adf.materialize_all() + expected = self.adf.df["x"] + 42.0 + pd.testing.assert_series_equal(self.adf.df["z"], expected, check_names=False) + + def test_dependency_order(self): + self.adf.add_alias("a", "x + y") + self.adf.add_alias("b", "a * 2") + self.adf.materialize_all() + expected = (self.adf.df["x"] + self.adf.df["y"]) * 2 + pd.testing.assert_series_equal(self.adf.df["b"], expected, check_names=False) + + def test_log_rate_with_constant(self): + median = self.adf.df["CTPLumi_countsFV0"].median() + self.adf.add_alias("countsFV0_median", f"{median}", dtype=np.float16, is_constant=True) + self.adf.add_alias("logRate", "log(CTPLumi_countsFV0/countsFV0_median)", dtype=np.float16) + self.adf.materialize_all() + expected = np.log(self.adf.df["CTPLumi_countsFV0"] / median).astype(np.float16) + pd.testing.assert_series_equal(self.adf.df["logRate"], expected, check_names=False) + + def test_circular_dependency_raises_error(self): + self.adf.add_alias("a", "b * 2") + with self.assertRaises(ValueError): + self.adf.add_alias("b", "a + 1") + + def test_undefined_symbol_raises_error(self): + self.adf.add_alias("z", "x + non_existent_variable") + with self.assertRaises(Exception): + self.adf.materialize_all() + + def test_invalid_syntax_raises_error(self): + self.adf.add_alias("z", "x +* y") + with self.assertRaises(SyntaxError): + self.adf.materialize_all() + + def test_partial_materialization(self): + self.adf.add_alias("a", "x + 1") + self.adf.add_alias("b", "a + 1") + self.adf.add_alias("c", "y + 1") + self.adf.materialize_alias("b") + self.assertIn("a", self.adf.df.columns) + self.assertIn("b", self.adf.df.columns) + self.assertNotIn("c", self.adf.df.columns) + + def test_export_import_tree_roundtrip(self): + df = pd.DataFrame({ + "x": np.linspace(0, 10, 100), + "y": np.linspace(10, 20, 100) + }) + adf = AliasDataFrame(df) + adf.add_alias("z", "x + y", dtype=np.float64) + adf.materialize_all() + + with tempfile.NamedTemporaryFile(suffix=".root", delete=False) as tmp: + adf.export_tree(tmp.name, treename="testTree", dropAliasColumns=False) + tmp_path = tmp.name + + adf_loaded = AliasDataFrame.read_tree(tmp_path, treename="testTree") + + assert "z" in adf_loaded.aliases + assert adf_loaded.aliases["z"] == "x + y" + adf_loaded.materialize_alias("z") + pd.testing.assert_series_equal(adf.df["z"], adf_loaded.df["z"], check_names=False) + + os.remove(tmp_path) + def test_getattr_column_and_alias_access(self): + df = pd.DataFrame({ + "x": np.arange(5), + "y": np.arange(5) * 2 + }) + adf = AliasDataFrame(df) + adf.add_alias("z", "x + y", dtype=np.int32) + + # Access real column + assert (adf.x == df["x"]).all() + # Access alias before materialization + assert "z" not in adf.df.columns + z_val = adf.z + assert "z" in adf.df.columns + expected = df["x"] + df["y"] + np.testing.assert_array_equal(z_val, expected) + + +class TestAliasDataFrameWithSubframes(unittest.TestCase): + def setUp(self): + n_tracks = 1000 + n_clusters = 100 + df_tracks = pd.DataFrame({ + "track_index": np.arange(n_tracks), + "mX": np.random.normal(0, 10, n_tracks), + "mY": np.random.normal(0, 10, n_tracks), + "mZ": np.random.normal(0, 10, n_tracks), + "mPt": np.random.exponential(1.0, n_tracks), + "mEta": np.random.normal(0, 1, n_tracks), + }) + + cluster_idx = np.repeat(df_tracks["track_index"], n_clusters) + df_clusters = pd.DataFrame({ + "track_index": cluster_idx, + "mX": np.random.normal(0, 10, len(cluster_idx)), + "mY": 
np.random.normal(0, 10, len(cluster_idx)),
+            "mZ": np.random.normal(0, 10, len(cluster_idx)),
+        })
+
+        self.df_tracks = df_tracks
+        self.df_clusters = df_clusters
+
+    def test_alias_cluster_track_dx(self):
+        adf_clusters = AliasDataFrame(self.df_clusters.copy())
+        adf_tracks = AliasDataFrame(self.df_tracks.copy())
+        adf_clusters.register_subframe("T", adf_tracks, index_columns="track_index")
+        adf_clusters.add_alias("mDX", "mX - T.mX")
+        adf_clusters.materialize_all()
+        merged = adf_clusters.df.merge(adf_tracks.df, on="track_index", suffixes=("", "_trk"))
+        expected = merged["mX"] - merged["mX_trk"]
+        pd.testing.assert_series_equal(adf_clusters.df["mDX"].reset_index(drop=True), expected.reset_index(drop=True), check_names=False)
+
+    def test_subframe_invalid_alias_raises(self):
+        adf_clusters = AliasDataFrame(self.df_clusters.copy())
+        adf_tracks = AliasDataFrame(self.df_tracks.copy())
+        adf_clusters.register_subframe("T", adf_tracks, index_columns="track_index")
+        adf_clusters.add_alias("invalid", "T.nonexistent")
+
+        with self.assertRaises(KeyError) as cm:
+            adf_clusters.materialize_alias("invalid")
+
+        self.assertIn("T", str(cm.exception))
+        self.assertIn("nonexistent", str(cm.exception))
+
+    def test_save_and_load_integrity(self):
+        adf_clusters = AliasDataFrame(self.df_clusters.copy())
+        adf_tracks = AliasDataFrame(self.df_tracks.copy())
+        adf_clusters.register_subframe("T", adf_tracks, index_columns="track_index")
+        adf_clusters.add_alias("mDX", "mX - T.mX")
+        adf_clusters.materialize_all()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # save() appends ".parquet" itself, so pass bare prefixes here
+            path_clusters = os.path.join(tmpdir, "clusters")
+            path_tracks = os.path.join(tmpdir, "tracks")
+            adf_clusters.save(path_clusters)
+            adf_tracks.save(path_tracks)
+
+            adf_tracks_loaded = AliasDataFrame.load(path_tracks)
+            adf_clusters_loaded = AliasDataFrame.load(path_clusters)
+            adf_clusters_loaded.register_subframe("T", adf_tracks_loaded, index_columns="track_index")
+            adf_clusters_loaded.add_alias("mDX", "mX - T.mX")
+            adf_clusters_loaded.materialize_all()
+
+            self.assertIn("mDX", adf_clusters_loaded.df.columns)
+            merged = adf_clusters_loaded.df.merge(adf_tracks_loaded.df, on="track_index", suffixes=("", "_trk"))
+            expected = merged["mX"] - merged["mX_trk"]
+            pd.testing.assert_series_equal(adf_clusters_loaded.df["mDX"].reset_index(drop=True), expected.reset_index(drop=True), check_names=False)
+            self.assertDictEqual(adf_clusters.aliases, adf_clusters_loaded.aliases)
+
+    def test_getattr_subframe_alias_access(self):
+        # Parent frame
+        df_main = pd.DataFrame({"track_id": [0, 1, 2], "x": [10, 20, 30]})
+        adf_main = AliasDataFrame(df_main)
+        # Subframe with alias
+        df_sub = pd.DataFrame({"track_id": [0, 1, 2], "residual": [1.1, 2.2, 3.3]})
+        adf_sub = AliasDataFrame(df_sub)
+        adf_sub.add_alias("residual_scaled", "residual * 100", dtype=np.float64)
+
+        # Register subframe
+        adf_main.register_subframe("track", adf_sub, index_columns="track_id")
+
+        # Add alias depending on subframe alias
+        adf_main.add_alias("resid100", "track.residual_scaled", dtype=np.float64)
+
+        # Trigger materialization via __getattr__
+        assert "resid100" not in adf_main.df.columns
+        result = adf_main.resid100
+        assert "resid100" in adf_main.df.columns
+        np.testing.assert_array_equal(result, df_sub["residual"] * 100)
+
+    def test_getattr_chained_subframe_access(self):
+        df_main = pd.DataFrame({"id": [0, 1, 2]})
+        df_sub = pd.DataFrame({"id": [0, 1, 2], "a": [5, 6, 7]})
+        adf_main = AliasDataFrame(df_main)
+        adf_sub = AliasDataFrame(df_sub)
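+        # Note: boolean-valued expressions are valid aliases; "cutA" below
+        # materializes to a boolean column (checked explicitly at the end).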
adf_sub.add_alias("cutA", "a > 5") + adf_main.register_subframe("sub", adf_sub, index_columns="id") + + adf_sub.materialize_alias("cutA") + + # Check chained access + expected = np.array([False, True, True]) + assert np.all(adf_main.sub.cutA == expected) # explicit value check + + +if __name__ == "__main__": + unittest.main() diff --git a/UTILS/dfextensions/DataFrameUtils.py b/UTILS/dfextensions/DataFrameUtils.py new file mode 100644 index 000000000..ec794ebc0 --- /dev/null +++ b/UTILS/dfextensions/DataFrameUtils.py @@ -0,0 +1,469 @@ +""" +# export O2DPG=~/alicesw/O2DPG/ +import sys, os; sys.path.insert(1, os.environ.get("O2DPG", "") + "/UTILS/dfextensions") + +""" + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap +from collections import OrderedDict + +def df_draw_scatter( + df, + expr, + selection=None, # str (pandas query), bool mask, or callable(df)->mask + color=None, # None | column name + marker=None, # None | column name for size + cmap="tab10", + jitter=False, + show=True # if False, don't plt.show(); always return (fig, ax) +): + """ + Create a scatter plot from a DataFrame with optional color, marker size, and jitter. + + Parameters + ---------- + df : pandas.DataFrame + Input DataFrame containing the data. + expr : str + Expression in 'y:x' format specifying y-axis and x-axis columns (e.g., 'sigma:pTmin'). + selection : str, bool array, or callable, optional + Filter to apply. Can be a pandas query string (engine='python'), a boolean mask, + or a callable returning a mask (default: None, uses full df). + color : str, optional + Column name for color mapping (continuous or categorical, default: None). + marker : str, optional + Column name for marker size mapping (numeric, default: None). + cmap : str, optional + Colormap name (e.g., 'tab10', default: 'tab10'). + jitter : bool, optional + Add small random jitter to x and y coordinates (default: False). + show : bool, optional + Display the plot if True (default: True); always returns (fig, ax). + + Returns + ------- + tuple + (fig, ax) : matplotlib Figure and Axes objects for further customization. + + Raises + ------ + ValueError + If expr is not in 'y:x' format or selection query fails. + TypeError + If selection is neither str, bool array, nor callable. + + Notes + ----- + - Filters NA values from x and y before plotting. + - Jitter helps visualize quantized data (x: ±0.1, y: ±2e-4). + - Colorbar is added for continuous color; categorical colors use the first color for NA. + """ + # --- parse "y:x" + try: + y_col, x_col = expr.split(":") + except ValueError: + raise ValueError("expr must be 'y:x'") + + # --- selection: str | mask | callable + if selection is None: + df_plot = df + elif isinstance(selection, str): + # engine='python' allows .str.contains() etc. 
df_plot = df.query(selection, engine="python")
+    elif callable(selection):
+        df_plot = df[selection(df)]
+    else:
+        # assume boolean mask-like
+        df_plot = df[selection]
+
+    # --- numeric x/y with NA filtering
+    x = pd.to_numeric(df_plot[x_col], errors="coerce")
+    y = pd.to_numeric(df_plot[y_col], errors="coerce")
+    valid = x.notna() & y.notna()
+    df_plot, x, y = df_plot[valid], x[valid], y[valid]
+
+    # --- optional jitter (useful when values are quantized)
+    if jitter:
+        x = x + np.random.uniform(-0.1, 0.1, len(x))
+        y = y + np.random.uniform(-2e-4, 2e-4, len(y))
+
+    # --- color handling
+    if color:
+        col_data = df_plot[color]
+        if col_data.dtype == "object":
+            cats = pd.Categorical(col_data)
+            c_vals = cats.codes  # -1 for NaN; handle below
+            # build a discrete colormap large enough
+            base = plt.get_cmap(cmap)
+            n = max(cats.categories.size, 1)
+            c_map = ListedColormap([base(i % base.N) for i in range(n)])
+            # replace -1 with 0 to plot (will map to first color)
+            c_plot = np.where(c_vals < 0, 0, c_vals)
+            colorbar_mode = "categorical"
+            categories = list(cats.categories)
+        else:
+            # .ffill() replaces the deprecated fillna(method="pad")
+            c_plot = pd.to_numeric(col_data, errors="coerce").ffill()
+            c_map = plt.get_cmap(cmap)
+            colorbar_mode = "continuous"
+            categories = None
+    else:
+        c_plot = "tab:blue"
+        c_map = None
+        colorbar_mode = None
+        categories = None
+
+    # --- marker size
+    if marker:
+        m_data = pd.to_numeric(df_plot[marker], errors="coerce")
+        m_min, m_max = m_data.min(), m_data.max()
+        # safe normalize
+        denom = (m_max - m_min) if (m_max > m_min) else 1.0
+        sizes = 100 + (m_data - m_min) / denom * 300
+    else:
+        sizes = 150
+
+    # --- plotting
+    fig, ax = plt.subplots(figsize=(8, 6))
+    scatter = ax.scatter(
+        x, y,
+        c=c_plot,
+        s=sizes,
+        cmap=c_map,
+        alpha=0.7,
+        linewidths=0.5,  # avoids edgecolor warning
+        edgecolors="k"
+    )
+
+    ax.set_xlim(x.min() - 0.5, x.max() + 0.5)
+    pad_y = max(1e-4, 0.02 * (y.max() - y.min()))
+    ax.set_ylim(y.min() - pad_y, y.max() + pad_y)
+
+    ax.set_xlabel(x_col)
+    ax.set_ylabel(y_col)
+    ax.set_title(f"Scatter: {y_col} vs {x_col}")
+    ax.grid(True, alpha=0.3)
+
+    # --- colorbar for continuous / categorical labels
+    if color and colorbar_mode:
+        cbar = plt.colorbar(scatter, ax=ax)
+        if colorbar_mode == "categorical" and categories is not None:
+            cbar.set_ticks(np.arange(len(categories)))
+            cbar.set_ticklabels(categories)
+        cbar.set_label(color)
+
+    if show:
+        plt.show()
+
+    return fig, ax
+
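+# Example usage of df_draw_scatter (a sketch; column names follow drawExample's
+# D0_resolution.csv, and the output file name is illustrative):
+#   fig, ax = df_draw_scatter(df, "sigma:pTmin",
+#                             selection="centmin == 0",                # query string
+#                             # selection=lambda d: d["sigma"] < 1e-2, # or a callable
+#                             color="productionId", marker="centmin", show=False)
+#   fig.savefig("sigma_vs_ptmin.png", dpi=150)
+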
+def df_draw_scatter_categorical(
+        df: pd.DataFrame,
+        expr: str,
+        selection: str = None,            # pandas query string ONLY (engine="python")
+        color: str = None,                # categorical column -> COLORS
+        marker_style: str = None,         # categorical column -> MARKER SHAPES
+        marker_size=None,                 # None | "" | number | column name
+        jitter: bool = False,
+        # category controls
+        top_k_color: int = None,          # keep top-K colors, rest -> other_label_color
+        other_label_color: str = "Other",
+        order_color: list = None,         # explicit order for color legend
+        top_k_marker: int = None,         # keep top-K marker cats, rest -> other_label_marker
+        other_label_marker: str = "Other",
+        order_marker: list = None,        # explicit order for marker legend
+        # palettes / markers
+        palette: list = None,             # list of color specs; defaults to repeating tab20
+        markers: list = None,             # list of marker styles; defaults to common shapes
+        # legends & layout
+        legend_outside: bool = True,      # put legends outside plot and reserve margin
+        legend_cols_color: int = 1,
+        legend_cols_marker: int = 1,
+        show: bool = False,
+):
+    """
+    Create a scatter plot with categorical colors and marker shapes from a DataFrame.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Input DataFrame containing the data.
+    expr : str
+        Expression in 'y:x' format specifying y-axis and x-axis columns (e.g., 'sigma:pTmin').
+    selection : str, optional
+        Pandas query string (engine='python') to filter data (e.g., "productionId.str.contains(...)").
+    color : str, optional
+        Column name for categorical color mapping (legend #1, default: None).
+    marker_style : str, optional
+        Column name for categorical marker shape mapping (legend #2, default: None).
+    marker_size : None | "" | number | str, optional
+        - None or "" : Constant size (150 pt²).
+        - number : Fixed size (pt²) for all points.
+        - str : Column name; numeric values normalized to [100, 400] pt², categorical cycled (150, 220, ...).
+    jitter : bool, default False
+        Add small uniform jitter to x and y coordinates.
+    top_k_color : int, optional
+        Keep top-K color categories, others mapped to `other_label_color` (default: None).
+    other_label_color : str, default "Other"
+        Label for non-top-K color categories.
+    order_color : list, optional
+        Explicit order for color legend categories (default: by frequency).
+    top_k_marker : int, optional
+        Keep top-K marker categories, others mapped to `other_label_marker` (default: None).
+    other_label_marker : str, default "Other"
+        Label for non-top-K marker categories.
+    order_marker : list, optional
+        Explicit order for marker legend categories (default: by frequency).
+    palette : list, optional
+        List of color specs to cycle (default: repeats 'tab20').
+    markers : list, optional
+        List of marker styles (default: ["o", "s", "^", ...]).
+    legend_outside : bool, default True
+        Place legends outside plot, reserving right margin.
+    legend_cols_color : int, default 1
+        Number of columns in color legend.
+    legend_cols_marker : int, default 1
+        Number of columns in marker legend.
+    show : bool, default False
+        Display the plot if True; always returns (fig, ax).
+
+    Returns
+    -------
+    tuple
+        (fig, ax) : matplotlib Figure and Axes objects.
+
+    Raises
+    ------
+    ValueError
+        If expr is not 'y:x' format or selection query fails.
+    TypeError
+        If selection is not a string or marker_size is invalid.
+
+    Notes
+    -----
+    - Designed for ALICE data visualization (e.g., D0 resolution plots).
+    - Filters NA values and handles categorical data robustly.
+    - Legends are added outside to avoid clipping; adjust `bbox_to_anchor` if needed.
+    """
+    # --- parse "y:x"
+    try:
+        y_col, x_col = expr.split(":")
+    except Exception as e:
+        raise ValueError("expr must be in 'y:x' format, e.g.
'sigma:pTmin'") from e + + # --- selection via pandas query + if selection is None: + df_plot = df + else: + if not isinstance(selection, str): + raise TypeError("selection must be a pandas query string (str).") + try: + df_plot = df.query(selection, engine="python") + except Exception as e: + raise ValueError(f"selection query failed: {selection}\n{e}") from e + + # --- numeric x/y with NA filtering + x = pd.to_numeric(df_plot[x_col], errors="coerce") + y = pd.to_numeric(df_plot[y_col], errors="coerce") + valid = x.notna() & y.notna() + df_plot, x, y = df_plot[valid], x[valid], y[valid] + + if jitter: + x = x + np.random.uniform(-0.1, 0.1, len(x)) + y = y + np.random.uniform(-2e-4, 2e-4, len(y)) + + # --- marker size handling + DEFAULT_SIZE = 150.0 # pt^2 + if marker_size is None or (isinstance(marker_size, str) and marker_size == ""): + sizes = np.full(len(df_plot), DEFAULT_SIZE, dtype=float) + elif isinstance(marker_size, (int, float)): + sizes = np.full(len(df_plot), float(marker_size), dtype=float) + elif isinstance(marker_size, str): + ms = df_plot[marker_size] + if pd.api.types.is_numeric_dtype(ms): + m = pd.to_numeric(ms, errors="coerce") + mmin, mmax = m.min(), m.max() + denom = (mmax - mmin) if (mmax > mmin) else 1.0 + sizes = 100.0 + (m - mmin) / denom * 300.0 + sizes = sizes.fillna(DEFAULT_SIZE).to_numpy(dtype=float) + else: + # categorical → cycle sizes + cats = ms.astype("string").fillna("NA").value_counts().index.tolist() + base_sizes = [150, 220, 290, 360, 430, 500] + size_map = {cat: base_sizes[i % len(base_sizes)] for i, cat in enumerate(cats)} + sizes = ms.astype("string").map(size_map).fillna(DEFAULT_SIZE).to_numpy(dtype=float) + else: + raise ValueError("marker_size must be None, '', a number, or a column name (str).") + + # --- categorical series (color & marker_style) + if color is None: + color_series = pd.Series(["All"] * len(df_plot), index=df_plot.index, dtype="string") + else: + color_series = df_plot[color].astype("string").fillna(other_label_color) + + if marker_style is None: + marker_series = pd.Series(["All"] * len(df_plot), index=df_plot.index, dtype="string") + else: + marker_series = df_plot[marker_style].astype("string").fillna(other_label_marker) + + # reduce categories (top-K) + if top_k_color is not None: + keep = set(color_series.value_counts().head(top_k_color).index) + color_series = color_series.where(color_series.isin(keep), other_label_color) + + if top_k_marker is not None: + keep = set(marker_series.value_counts().head(top_k_marker).index) + marker_series = marker_series.where(marker_series.isin(keep), other_label_marker) + + # final ordering + def order_categories(series, explicit_order): + counts = series.value_counts() + by_freq = list(counts.index) + if explicit_order: + seen, ordered = set(), [] + for c in explicit_order: + if c in counts.index and c not in seen: + ordered.append(c); seen.add(c) + for c in by_freq: + if c not in seen: + ordered.append(c); seen.add(c) + return ordered + return by_freq + + color_cats = order_categories(color_series, order_color) + marker_cats = order_categories(marker_series, order_marker) + + # palettes / marker shapes + if palette is None: + base = list(plt.get_cmap("tab20").colors) + repeats = (len(color_cats) + len(base) - 1) // len(base) + palette = (base * max(1, repeats))[:len(color_cats)] + else: + repeats = (len(color_cats) + len(palette) - 1) // len(palette) + palette = (list(palette) * max(1, repeats))[:len(color_cats)] + + if markers is None: + markers = ["o", "s", "^", "D", "P", "X", "v", "<", 
">", "h", "H", "*", "p"] + else: + markers = list(markers) + + color_map = OrderedDict((cat, palette[i]) for i, cat in enumerate(color_cats)) + marker_map = OrderedDict((cat, markers[i % len(markers)]) for i, cat in enumerate(marker_cats)) + + # --- plot + fig, ax = plt.subplots(figsize=(8, 6), constrained_layout=False) + if legend_outside: + fig.subplots_adjust(right=0.78) # reserve space for legends on the right + + # robust bool masks (no pd.NA) + color_vals = color_series.astype("string") + marker_vals = marker_series.astype("string") + + for mcat in marker_cats: + m_mask = (marker_vals == mcat).fillna(False).to_numpy(dtype=bool) + for ccat in color_cats: + c_mask = (color_vals == ccat).fillna(False).to_numpy(dtype=bool) + mc_mask = np.logical_and(m_mask, c_mask) + if not np.any(mc_mask): + continue + ax.scatter( + x.values[mc_mask], y.values[mc_mask], + c=[color_map[ccat]], + marker=marker_map[mcat], + s=sizes[mc_mask], + alpha=0.75, + edgecolors="k", + linewidths=0.5, + ) + + # axes & limits + ax.set_xlabel(x_col) + ax.set_ylabel(y_col) + ax.set_title(f"Scatter (categorical): {y_col} vs {x_col}") + ax.grid(True, alpha=0.3) + + if len(x): + ax.set_xlim(x.min() - 0.5, x.max() + 0.5) + if len(y): + pad_y = max(1e-4, 0.02 * (y.max() - y.min())) + ax.set_ylim(y.min() - pad_y, y.max() + pad_y) + + # legends + color_handles = [ + plt.Line2D([0], [0], marker="o", color="none", + markerfacecolor=color_map[c], markeredgecolor="k", + markersize=8, linewidth=0) for c in color_cats + ] + color_legend = ax.legend( + color_handles, list(color_cats), + title=color if color else "", + ncol=legend_cols_color, + loc="center left" if legend_outside else "best", + bbox_to_anchor=(1.0, 0.5) if legend_outside else None, + frameon=True, + ) + ax.add_artist(color_legend) + + marker_handles = [ + plt.Line2D([0], [0], marker=marker_map[m], color="none", + markerfacecolor="lightgray", markeredgecolor="k", + markersize=8, linewidth=0) for m in marker_cats + ] + marker_legend = ax.legend( + marker_handles, list(marker_cats), + title=marker_style if marker_style else "", + ncol=legend_cols_marker, + loc="center left" if legend_outside else "best", + bbox_to_anchor=(1.0, 0.15) if legend_outside else None, + frameon=True, + ) + ax.add_artist(marker_legend) + + if show: + plt.show() + + return fig, ax + +def drawExample(): + df=df = pd.read_csv("D0_resolution.csv") + df.rename(columns={"production ID": "productionId"}, inplace=True) + + # + fig, ax = df_draw_scatter( + df, + "sigma:pTmin", + selection=lambda d: d["productionId"].str.contains(r"(LHC25b8a|LHC24)", regex=True, na=False), + color="productionId", + marker="centmin", + show=True + ) + # + fig, ax = df_draw_scatter_categorical( + df, "sigma:pTmin", + selection="productionId.str.contains(r'(?:LHC25b8a|LHC24|LHC25a5)', regex=True, na=False)", + color="productionId", + marker_style="centmin", + marker_size=100, # pt² + ) + fig.savefig("out.png", dpi=200, bbox_inches="tight") + ## + fig, ax = df_draw_scatter_categorical( + df, "sigma:pTmin", + selection="productionId.str.contains(r'(?:LHC24|LHC25a5)', regex=True, na=False)", + color="productionId", + marker_style="centmin", + marker_size=100, # pt² + ) + fig.savefig("resol_LHC24_LHC25a5.png", dpi=200) + + fig, ax = df_draw_scatter_categorical( + df, "sigma:pTmin", + selection="productionId.str.contains(r'(?:LHC25b8a|LHC24)', regex=True, na=False)", + color="productionId", + marker_style="centmin", + marker_size=100, # pt² + ) + fig.savefig("resol_LHC24_LHC25b8a.png", dpi=150, bbox_inches="tight") \ No newline 
diff --git a/UTILS/dfextensions/FormulaLinearModel.py b/UTILS/dfextensions/FormulaLinearModel.py
new file mode 100644
index 000000000..2fb2edb89
--- /dev/null
+++ b/UTILS/dfextensions/FormulaLinearModel.py
@@ -0,0 +1,162 @@
+"""FormulaLinearModel.py
+import sys,os; sys.path.insert(1, os.environ["O2DPG"]+"/UTILS/dfextensions");
+from FormulaLinearModel import *
+
+Formula-based linear regression model with code export helpers (C++/pandas/JavaScript).
+"""
+
+import ast
+import re
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+class FormulaLinearModel:
+    def __init__(self, name, formulas, target, precision=4, weight_formula=None, var_list=None):
+        """
+        Formula-based linear regression model supporting code export.
+
+        :param name: name of the model (used for function naming)
+        :param formulas: dict of {name: formula_string}, e.g., {'x1': 'v0*var00', 'x2': 'w1*var10'}
+        :param target: string expression for target variable, e.g., 'log(y)' or 'y'
+        :param precision: number of significant digits in code export (default: 4)
+        :param weight_formula: optional string formula for sample weights
+        :param var_list: optional list of variable names to fix the argument order for C++/JS export
+
+        Example usage:
+
+        >>> formulas = {'x1': 'v0*var00', 'x2': 'w1*var10'}
+        >>> model = FormulaLinearModel("myModel", formulas, target='y')
+        >>> model.fit(df)
+        >>> df['y_pred'] = model.predict(df)
+        >>> print(model.to_cpp())
+        >>> print(model.to_pandas())
+        >>> print(model.to_javascript())
+        """
+        self.name = name
+        self.formulas = formulas
+        self.target = target
+        self.precision = precision
+        self.weight_formula = weight_formula
+        self.model = LinearRegression()
+        self.feature_names = list(formulas.keys())
+
+        extracted_vars = self._extract_variables(from_formulas_only=True)
+        if var_list:
+            missing = set(extracted_vars) - set(var_list)
+            if missing:
+                raise ValueError(f"Provided var_list is missing variables: {missing}")
+            self.variables = var_list
+        else:
+            self.variables = sorted(extracted_vars)
+
+    def _extract_variables(self, debug=False, from_formulas_only=False):
+        class VarExtractor(ast.NodeVisitor):
+            def __init__(self):
+                self.vars = set()
+                self.funcs = set()
+
+            def visit_Name(self, node):
+                self.vars.add(node.id)
+
+            def visit_Call(self, node):
+                if isinstance(node.func, ast.Name):
+                    self.funcs.add(node.func.id)
+                self.generic_visit(node)
+
+        extractor = VarExtractor()
+        if from_formulas_only:
+            all_exprs = list(self.formulas.values())
+        else:
+            all_exprs = list(self.formulas.values())
+            if self.weight_formula:
+                all_exprs.append(self.weight_formula)
+            if isinstance(self.target, str):
+                all_exprs.append(self.target)
+
+        for expr in all_exprs:
+            tree = ast.parse(expr, mode='eval')
+            extractor.visit(tree)
+
+        if debug:
+            print("Detected variables:", extractor.vars)
+            print("Detected functions:", extractor.funcs)
+
+        return extractor.vars - extractor.funcs
+
+    def fit(self, df):
+        X = np.column_stack([df.eval(expr) for expr in self.formulas.values()])
+        y = df.eval(self.target) if isinstance(self.target, str) else df[self.target]
+        if self.weight_formula:
+            sample_weight = df.eval(self.weight_formula).values
+            self.model.fit(X, y, sample_weight=sample_weight)
+        else:
+            self.model.fit(X, y)
+
+    def predict(self, df):
+        X = np.column_stack([df.eval(expr) for expr in self.formulas.values()])
+        mask_valid = ~np.isnan(X).any(axis=1)
+        y_pred = np.full(len(df), np.nan)
+        y_pred[mask_valid] = self.model.predict(X[mask_valid])
+        return y_pred
+
+    def coef_dict(self):
+        return dict(zip(self.feature_names, self.model.coef_)), self.model.intercept_
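+
+    # Illustrative to_cpp() output for the docstring example (coefficient and
+    # intercept values are invented):
+    #   float myModel(float v0, float var00, float var10, float w1) {
+    #       return (1.2)*(v0*var00) + (0.5)*(w1*var10) + (0.01); }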
self.model.coef_)), self.model.intercept_ + + def to_cpp(self): + fmt = f"{{0:.{self.precision}g}}" + coefs, intercept = self.coef_dict() + terms = [f"({fmt.format(coef)})*({self.formulas[name]})" for name, coef in coefs.items()] + expr = " + ".join(terms) + f" + ({fmt.format(intercept)})" + args = ", ".join([f"float {var}" for var in self.variables]) + return f"float {self.name}({args}) {{ return {expr}; }}" + + def to_pandas(self): + fmt = f"{{0:.{self.precision}g}}" + coefs, intercept = self.coef_dict() + terms = [f"({fmt.format(coef)})*({expr})" for expr, coef in zip(self.formulas.values(), coefs.values())] + return " + ".join(terms) + f" + ({fmt.format(intercept)})" + + def to_javascript(self): + fmt = f"{{0:.{self.precision}g}}" + coefs, intercept = self.coef_dict() + terms = [f"({fmt.format(coef)})*({self.formulas[name]})" for name, coef in coefs.items()] + expr = " + ".join(terms) + f" + ({fmt.format(intercept)})" + args = ", ".join(self.variables) + return f"function {self.name}({args}) {{ return {expr}; }}" + + def to_cppstd(self, name, variables, expression, precision=6): + args = ", ".join([f"const std::vector<float>& {v}" for v in variables]) + output = [f"std::vector<float> {name}(size_t n, {args}) {{"] + output.append(f" std::vector<float> result(n);") + output.append(f" for (size_t i = 0; i < n; ++i) {{") + for v in variables: + output.append(f" float {v}_i = {v}[i];") + expr_cpp = expression + for v in variables: + expr_cpp = expr_cpp.replace(v, f"{v}_i") + output.append(f" result[i] = {expr_cpp};") + output.append(" }") + output.append(" return result;") + output.append("}") + return "\n".join(output) + + + def to_cpparrow(self, name, variables, expression, precision=6): + args = ", ".join([f"const arrow::FloatArray& {v}" for v in variables]) + output = [f"std::shared_ptr<arrow::Array> {name}(int64_t n, {args}, arrow::MemoryPool* pool) {{"] + output.append(f" arrow::FloatBuilder builder(pool);") + output.append(f" builder.Reserve(n);") + output.append(f" for (int64_t i = 0; i < n; ++i) {{") + expr_cpp = expression + for v in variables: + output.append(f" float {v}_i = {v}.Value(i);") + expr_cpp = expr_cpp.replace(v, f"{v}_i") + output.append(f" builder.UnsafeAppend({expr_cpp});") + output.append(" }") + output.append(" std::shared_ptr<arrow::Array> result;") + output.append(" builder.Finish(&result);") + output.append(" return result;") + output.append("}") + return "\n".join(output) + + diff --git a/UTILS/dfextensions/__init__.py b/UTILS/dfextensions/__init__.py new file mode 100644 index 000000000..bbef16072 --- /dev/null +++ b/UTILS/dfextensions/__init__.py @@ -0,0 +1,12 @@ +# __init__.py + +from .AliasDataFrame import AliasDataFrame +from .FormulaLinearModel import FormulaLinearModel +from .DataFrameUtils import * # if it provides general helper functions +from .groupby_regression import * # or other relevant functions + +__all__ = [ + "AliasDataFrame", + "FormulaLinearModel", + "GroupByRegressor" +] diff --git a/UTILS/dfextensions/bench_groupby_regression.py b/UTILS/dfextensions/bench_groupby_regression.py new file mode 100644 index 000000000..e412c7817 --- /dev/null +++ b/UTILS/dfextensions/bench_groupby_regression.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python3 +""" +bench_groupby_regression.py — Single-file benchmark suite and reporter + +Scenarios covered (configurable via CLI): + 1) Clean baseline (serial & parallel) + 2) Outliers: 5% @ 3σ, 10% @ 5σ, 10% @ 10σ + 3) Group sizes: 5, 20, 100 rows/group + 4) n_jobs: 1, 4, 10 + 5) fitters: ols, robust, huber (if supported by implementation) + 6) sigmaCut: 3, 5, 10,
100 + +Outputs: + - Pretty text report + - JSON results (per scenario, with timing and configuration) + - Optional CSV summary + +Usage examples: + python3 bench_groupby_regression.py --quick + python3 bench_groupby_regression.py --rows 50000 --groups 10000 --out out_dir + python3 bench_groupby_regression.py --emit-csv + +Note: + This script expects 'groupby_regression.py' in PYTHONPATH or next to it and + uses GroupByRegressor.make_parallel_fit(...). See the wiring in _run_one(). +""" +from __future__ import annotations +import argparse, json, math, os, sys, time +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import List, Dict, Any, Tuple + +import numpy as np +import pandas as pd + +# --- Import the project module --- +try: + import groupby_regression as gr + from groupby_regression import GroupByRegressor +except Exception as e: + print("[ERROR] Failed to import groupby_regression.py:", e, file=sys.stderr) + raise + +# --- Data Generators (Phase 1) --- +def _make_groups(n_rows: int, n_groups: int, rng: np.random.Generator) -> np.ndarray: + base = np.repeat(np.arange(n_groups, dtype=np.int32), n_rows // n_groups) + rem = n_rows - base.size + if rem > 0: + extra = rng.choice(n_groups, size=rem, replace=False) + base = np.concatenate([base, extra.astype(np.int32, copy=False)]) + rng.shuffle(base) + return base + +def _find_diag_col(df: pd.DataFrame, base: str, dp: str, suffix: str | None = None) -> str | None: + """ + Return diagnostics column for a given base (e.g. 'time_ms'), handling suffixes. + If suffix is provided, match startswith(dp+base) and endswith(suffix). + """ + exact = dp + base + if suffix is None and exact in df.columns: + return exact + pref = dp + base + for c in df.columns: + if not isinstance(c, str): + continue + if not c.startswith(pref): + continue + if suffix is not None and not c.endswith(suffix): + continue + return c + return None + + +def create_clean_data(n_rows: int, n_groups: int, *, seed: int = 42, noise_sigma: float = 1.0, x_corr: float = 0.0) -> pd.DataFrame: + rng = np.random.default_rng(seed) + group = _make_groups(n_rows, n_groups, rng) + mean = np.array([0.0, 0.0]) + cov = np.array([[1.0, x_corr], [x_corr, 1.0]]) + x = rng.multivariate_normal(mean, cov, size=n_rows, method="cholesky") + x1 = x[:, 0].astype(np.float32); x2 = x[:, 1].astype(np.float32) + eps = rng.normal(0.0, noise_sigma, size=n_rows).astype(np.float32) + y = (2.0 * x1 + 3.0 * x2 + eps).astype(np.float32) + df = pd.DataFrame({"group": group, "x1": x1, "x2": x2, "y": y}) + return df + +def create_data_with_outliers(n_rows: int, n_groups: int, *, outlier_pct: float = 0.10, outlier_magnitude: float = 5.0, + seed: int = 42, noise_sigma: float = 1.0, x_corr: float = 0.0) -> pd.DataFrame: + df = create_clean_data(n_rows, n_groups, seed=seed, noise_sigma=noise_sigma, x_corr=x_corr) + rng = np.random.default_rng(seed + 1337) + k = int(math.floor(outlier_pct * n_rows)) + if k > 0: + idx = rng.choice(n_rows, size=k, replace=False) + signs = rng.choice(np.array([-1.0, 1.0], dtype=np.float32), size=k, replace=True) + shift = (outlier_magnitude * noise_sigma * signs).astype(np.float32) + y = df["y"].to_numpy(copy=True) + y[idx] = (y[idx] + shift).astype(np.float32) + df["y"] = y + return df + +# --- Benchmark Plumbing --- +@dataclass +class Scenario: + name: str + outlier_pct: float + outlier_mag: float + rows_per_group: int + n_groups: int + n_jobs: int + fitter: str + sigmaCut: float + +def _run_one(df: pd.DataFrame, scenario: Scenario, args) -> Dict[str, Any]: + 
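+ """Run one benchmark scenario: add the helper group2/weight columns, time a single GroupByRegressor.make_parallel_fit call, and return a record with the scenario config, timing results, and (only when --diag is set) the per-group parameter frame."""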
df = df.copy() + df["group2"] = df["group"].astype(np.int32) + df["weight"] = 1.0 + selection = pd.Series(True, index=df.index) + + t0 = time.perf_counter() + _, df_params = GroupByRegressor.make_parallel_fit( + df, + gb_columns=["group", "group2"], + fit_columns=["y"], + linear_columns=["x1", "x2"], + median_columns=[], + weights="weight", + suffix="_fit", + selection=selection, + addPrediction=False, + n_jobs=scenario.n_jobs, + min_stat=[3, 4], + sigmaCut=scenario.sigmaCut, + fitter=scenario.fitter, + batch_size="auto", + diag=getattr(args, "diag", False), + diag_prefix=getattr(args, "diag_prefix", "diag_"), + ) + dt = time.perf_counter() - t0 + n_groups_eff = int(df_params.shape[0]) + per_1k = dt / (n_groups_eff / 1000.0) if n_groups_eff else float("nan") + + return { + "scenario": scenario.name, + "config": { + "n_jobs": scenario.n_jobs, + "sigmaCut": scenario.sigmaCut, + "fitter": scenario.fitter, + "rows_per_group": scenario.rows_per_group, + "n_groups": scenario.n_groups, + "outlier_pct": scenario.outlier_pct, + "outlier_mag": scenario.outlier_mag, + }, + "result": { + "total_sec": dt, + "sec_per_1k_groups": per_1k, + "n_groups_effective": n_groups_eff, + }, + "df_params": df_params if getattr(args, "diag", False) else None, # <-- add this + } + +def _make_df(s: Scenario, seed: int = 7) -> pd.DataFrame: + n_rows = s.rows_per_group * s.n_groups + if s.outlier_pct > 0.0: + return create_data_with_outliers(n_rows, s.n_groups, outlier_pct=s.outlier_pct, outlier_magnitude=s.outlier_mag, seed=seed) + else: + return create_clean_data(n_rows, s.n_groups, seed=seed) + +def _format_report(rows: List[Dict[str, Any]]) -> str: + lines = [] + lines.append("=" * 64); lines.append("BENCHMARK: GroupBy Regression"); lines.append("=" * 64) + for r in rows: + cfg = r["config"]; res = r["result"] + lines.append("") + lines.append(f"Scenario: {r['scenario']}") + lines.append(f" Config: n_jobs={cfg['n_jobs']}, sigmaCut={cfg['sigmaCut']}, fitter={cfg['fitter']}") + lines.append(f" Data: {cfg['rows_per_group']*cfg['n_groups']:,} rows, {res['n_groups_effective']:,} groups (target {cfg['n_groups']:,}), ~{cfg['rows_per_group']} rows/group") + if cfg['outlier_pct']>0: + lines.append(f" Outliers: {cfg['outlier_pct']*100:.0f}% at {cfg['outlier_mag']}σ") + lines.append(f" Result: {res['total_sec']:.2f}s ({res['sec_per_1k_groups']:.2f}s per 1k groups)") + lines.append("") + return "\n".join(lines) + +def run_suite(args) -> Tuple[List[Dict[str, Any]], str, str, str | None]: + # Build scenarios + scenarios: List[Scenario] = [] + + # Baselines + scenarios.append(Scenario("Clean Data, Serial", 0.0, 0.0, args.rows_per_group, args.groups, 1, args.fitter, args.sigmaCut)) + if not args.serial_only: + scenarios.append(Scenario("Clean Data, Parallel", 0.0, 0.0, args.rows_per_group, args.groups, args.n_jobs, args.fitter, args.sigmaCut)) + + # Outlier sets + scenarios.append(Scenario("5% Outliers (3σ), Serial", 0.05, 3.0, args.rows_per_group, args.groups, 1, args.fitter, args.sigmaCut)) + scenarios.append(Scenario("10% Outliers (5σ), Serial", 0.10, 5.0, args.rows_per_group, args.groups, 1, args.fitter, args.sigmaCut)) + # High-outlier stress test + scenarios.append( + Scenario( + "30% Outliers (5σ), Serial", + 0.30, 5.0, + args.rows_per_group, + args.groups, + 1, + args.fitter, + args.sigmaCut, + ) + ) + if not args.serial_only: + scenarios.append( + Scenario( + "30% Outliers (5σ), Parallel", + 0.30, 5.0, + args.rows_per_group, + args.groups, + args.n_jobs, + args.fitter, + args.sigmaCut, + ) + ) + + if not args.serial_only: + 
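+ # Parallel counterpart of the 10% outlier (5σ) serial scenario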
scenarios.append(Scenario("10% Outliers (5σ), Parallel", 0.10, 5.0, args.rows_per_group, args.groups, args.n_jobs, args.fitter, args.sigmaCut)) + scenarios.append(Scenario("10% Outliers (10σ), Serial", 0.10, 10.0, args.rows_per_group, args.groups, 1, args.fitter, args.sigmaCut)) + + # Prepare output + out_dir = Path(args.out).resolve() + out_dir.mkdir(parents=True, exist_ok=True) + diag_rows=[] + human_summaries: List[Tuple[str, str]] = [] + # Run + results: List[Dict[str, Any]] = [] + for s in scenarios: + df = _make_df(s, seed=args.seed) + # PASS ARGS HERE + out = _run_one(df, s, args) + results.append(out) + if args.diag and out.get("df_params") is not None: + dfp = out["df_params"] + dp = args.diag_prefix + # Try to infer a suffix from any diag column (optional). If you know your suffix, set it via CLI later. + # For now we won’t guess; we’ll just use dp and allow both suffixed or unsuffixed. + + # 2a) Write top-10 violators per scenario + safe = (s.name.replace(" ", "_") + .replace("%","pct") + .replace("(","").replace(")","") + .replace("σ","sigma")) + tcol = _find_diag_col(dfp, "time_ms", dp) + if tcol: + dfp.sort_values(tcol, ascending=False).head(10).to_csv( + out_dir / f"diag_top10_time__{safe}.csv", index=False + ) + rcol = _find_diag_col(dfp, "n_refits", dp) + if rcol: + dfp.sort_values(rcol, ascending=False).head(10).to_csv( + out_dir / f"diag_top10_refits__{safe}.csv", index=False + ) + + # 2b) Class-level summary (machine + human) + summary = GroupByRegressor.summarize_diagnostics(dfp, diag_prefix=dp,diag_suffix="_fit") + summary_row = {"scenario": s.name, **summary} + diag_rows.append(summary_row) + human = GroupByRegressor.format_diagnostics_summary(summary) + human_summaries.append((s.name, human)) + if args.diag: + txt_path = out_dir / "benchmark_report.txt" + with open(txt_path, "a") as f: + f.write("\nDiagnostics summary (per scenario):\n") + for name, human in human_summaries: + f.write(f" - {name}: {human}\n") + f.write("\nTop-10 violators were saved per scenario as:\n") + f.write(" diag_top10_time__.csv, diag_top10_refits__.csv\n") + + + # Save + txt_path = out_dir / "benchmark_report.txt" + json_path = out_dir / "benchmark_results.json" + with open(txt_path, "w") as f: + f.write(_format_report(results)) + results_slim = [{k: v for k, v in r.items() if k != "df_params"} for r in results] + with open(json_path, "w") as f: + json.dump(results_slim, f, indent=2) + + csv_path = None + if args.emit_csv: + import csv + csv_path = out_dir / "benchmark_results.csv" + with open(csv_path, "w", newline="") as f: + w = csv.writer(f) + w.writerow(["scenario","n_jobs","sigmaCut","fitter","rows_per_group","n_groups","outlier_pct","outlier_mag","total_sec","sec_per_1k_groups","n_groups_effective"]) + for r in results: + cfg = r["config"]; res = r["result"] + w.writerow([r["scenario"], cfg["n_jobs"], cfg["sigmaCut"], cfg["fitter"], cfg["rows_per_group"], cfg["n_groups"], cfg["outlier_pct"], cfg["outlier_mag"], res["total_sec"], res["sec_per_1k_groups"], res["n_groups_effective"]]) + + # --- Append diagnostics summaries to the text report --- + if args.diag and 'human_summaries' in locals() and human_summaries: + with open(txt_path, "a") as f: + f.write("\nDiagnostics summary (per scenario):\n") + for name, human in human_summaries: + f.write(f" - {name}: {human}\n") + f.write("\nTop-10 violators saved as diag_top10_time__.csv " + "and diag_top10_refits__.csv\n") + + return results, str(txt_path), str(json_path), (str(csv_path) if csv_path else None) + +def parse_args(): + p = 
argparse.ArgumentParser(description="GroupBy Regression Benchmark Suite") + p.add_argument("--rows-per-group", type=int, default=5, help="Rows per group.") + p.add_argument("--groups", type=int, default=5000, help="Number of groups.") + p.add_argument("--n-jobs", type=int, default=4, help="Workers for parallel scenarios.") + p.add_argument("--sigmaCut", type=float, default=5.0, help="Sigma cut for robust fitting.") + p.add_argument("--fitter", type=str, default="ols", help="Fitter: ols|robust|huber depending on implementation.") + p.add_argument("--seed", type=int, default=7, help="Random seed.") + p.add_argument("--out", type=str, default="bench_out", help="Output directory.") + p.add_argument("--emit-csv", action="store_true", help="Also emit CSV summary.") + p.add_argument("--serial-only", action="store_true", help="Skip parallel scenarios.") + p.add_argument("--quick", action="store_true", help="Small quick run: groups=200.") + p.add_argument("--diag", action="store_true", + help="Collect per-group diagnostics into dfGB (diag_* columns).") + p.add_argument("--diag-prefix", type=str, default="diag_", + help="Prefix for diagnostic columns (default: diag_).") + + args = p.parse_args() + if args.quick: + args.groups = min(args.groups, 200) + return args + + + +def main(): + args = parse_args() + results, txt_path, json_path, csv_path = run_suite(args) + print(_format_report(results)) + print("\nSaved outputs:") + print(" -", txt_path) + print(" -", json_path) + if csv_path: print(" -", csv_path) + +if __name__ == "__main__": + main() diff --git a/UTILS/dfextensions/bench_out_quick/benchmark_report.txt b/UTILS/dfextensions/bench_out_quick/benchmark_report.txt new file mode 100644 index 000000000..492862367 --- /dev/null +++ b/UTILS/dfextensions/bench_out_quick/benchmark_report.txt @@ -0,0 +1,26 @@ +================================================================ +BENCHMARK: GroupBy Regression +================================================================ + +Scenario: Clean Data, Serial + Config: n_jobs=1, sigmaCut=5.0, fitter=ols + Data: 1,000 rows, 200 groups (target 200), ~5 rows/group + Result: 0.36s (1.78s per 1k groups) + +Scenario: 5% Outliers (3σ), Serial + Config: n_jobs=1, sigmaCut=5.0, fitter=ols + Data: 1,000 rows, 200 groups (target 200), ~5 rows/group + Outliers: 5% at 3.0σ + Result: 0.34s (1.72s per 1k groups) + +Scenario: 10% Outliers (5σ), Serial + Config: n_jobs=1, sigmaCut=5.0, fitter=ols + Data: 1,000 rows, 200 groups (target 200), ~5 rows/group + Outliers: 10% at 5.0σ + Result: 0.34s (1.71s per 1k groups) + +Scenario: 10% Outliers (10σ), Serial + Config: n_jobs=1, sigmaCut=5.0, fitter=ols + Data: 1,000 rows, 200 groups (target 200), ~5 rows/group + Outliers: 10% at 10.0σ + Result: 0.34s (1.71s per 1k groups) diff --git a/UTILS/dfextensions/diff.txt b/UTILS/dfextensions/diff.txt new file mode 100644 index 000000000..1b9adf90a --- /dev/null +++ b/UTILS/dfextensions/diff.txt @@ -0,0 +1,41 @@ +--- AliasDataFrame.py + ++++ AliasDataFrame.py + +@@ def export_tree(self, filename_or_file, treename="tree", dropAliasColumns=True,compression=uproot.LZMA(level=9),chunk_bytes=None): +- def export_tree(self, filename_or_file, treename="tree", dropAliasColumns=True,compression=uproot.LZMA(level=9),chunk_bytes=None): ++ def export_tree(self, filename_or_file, treename="tree", dropAliasColumns=True,compression=uproot.LZMA(level=9),chunk_bytes=None, basket_size=1024*1024): + is_path = isinstance(filename_or_file, str) + + if is_path: + with 
uproot.recreate(filename_or_file,compression=compression) as f: +- self._write_to_uproot(f, treename, dropAliasColumns, chunk_bytes=chunk_bytes) ++ self._write_to_uproot(f, treename, dropAliasColumns, chunk_bytes=chunk_bytes, basket_size=basket_size) + self._write_metadata_to_root(filename_or_file, treename) + else: +- self._write_to_uproot(filename_or_file, treename, dropAliasColumns) ++ self._write_to_uproot(filename_or_file, treename, dropAliasColumns, basket_size=basket_size) + + for subframe_name, entry in self._subframes.items(): +- entry["frame"]._write_metadata_to_root(filename_or_file, f"{treename}__subframe__{subframe_name}") ++ entry["frame"]._write_metadata_to_root(filename_or_file, f"{treename}__subframe__{subframe_name}") + +@@ def _write_to_uproot(self, uproot_file, treename, dropAliasColumns,chunk_bytes=None): +- def _write_to_uproot(self, uproot_file, treename, dropAliasColumns,chunk_bytes=None): ++ def _write_to_uproot(self, uproot_file, treename, dropAliasColumns,chunk_bytes=None, basket_size=1024*1024): + export_cols = [col for col in self.df.columns if not dropAliasColumns or col not in self.aliases] + dtype_casts = {col: np.float32 for col in export_cols if self.df[col].dtype == np.float16} + export_df = self.df[export_cols].astype(dtype_casts) + +- uproot_file[treename] = export_df ++ uproot_file.mktree( ++ treename, ++ {col: export_df[col].dtype for col in export_df.columns}, ++ basket_size=basket_size ++ ) ++ uproot_file[treename].extend(export_df) + + for subframe_name, entry in self._subframes.items(): +- entry["frame"].export_tree(uproot_file, f"{treename}__subframe__{subframe_name}", dropAliasColumns) ++ entry["frame"].export_tree(uproot_file, f"{treename}__subframe__{subframe_name}", dropAliasColumns, basket_size=basket_size) + diff --git a/UTILS/dfextensions/groupby_regression.md b/UTILS/dfextensions/groupby_regression.md new file mode 100644 index 000000000..707959fad --- /dev/null +++ b/UTILS/dfextensions/groupby_regression.md @@ -0,0 +1,316 @@ +# GroupBy Linear Regression Utilities + +This module provides utilities for computing group-wise linear fits and robust statistics on pandas DataFrames. It is designed to support workflows that require fitting separate models across grouped subsets of data. + +Originally developed for **distortion correction** and **dE/dx calibration** in high-energy physics experiments, the code has since been generalized to support broader applications involving grouped linear regression and statistical feature extraction. + +## Functions + +### `GroupByRegressor.make_linear_fit(...)` + +Performs group-wise **ordinary least squares (OLS)** regression fits. 
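+A minimal call sketch for the OLS variant (the DataFrame and column names `sector`, `x`, `y` are illustrative, not part of the API): + +```python +from groupby_regression import GroupByRegressor + +# Per-group OLS of y on x; group key and columns are hypothetical. +df_out, dfGB = GroupByRegressor.make_linear_fit( + df, + gb_columns=['sector'], + fit_columns=['y'], + linear_columns=['x'], + median_columns=['x'], + suffix='_fit', + selection=df['y'].notna(), + addPrediction=True, + min_stat=10, +) +``` +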
+ +#### Parameters: + +* `df (pd.DataFrame)`: Input data +* `gb_columns (list[str])`: Columns to group by +* `fit_columns (list[str])`: Dependent (target) variables +* `linear_columns (list[str])`: Independent variables +* `median_columns (list[str])`: Columns for which medians are computed +* `suffix (str)`: Suffix for generated columns +* `selection (pd.Series)`: Boolean mask selecting rows to use +* `addPrediction (bool)`: If True, predictions are added to the original DataFrame +* `cast_dtype (str | None)`: Optional type casting (e.g., 'float32', 'float16') for fit results +* `min_stat (int)`: Minimum number of rows in a group to perform fitting + +#### Returns: + +* `(df_out, dfGB)`: + + * `df_out`: Original DataFrame with predictions (if enabled) + * `dfGB`: Per-group statistics, including slopes, intercepts, medians, and bin counts + +--- + +### `GroupByRegressor.make_parallel_fit(...)` + +Performs **robust group-wise regression** using `HuberRegressor`, with optional parallelization. + +#### Additional Parameters: + +* `weights (str)`: Column to use as weights during regression +* `n_jobs (int)`: Number of parallel processes to use +* `min_stat (list[int])`: Minimum number of points required for each predictor in `linear_columns` +* `sigmaCut (float)`: Threshold multiplier for MAD to reject outliers + +#### Notes: + +* Supports partial predictor exclusion per group based on `min_stat` +* Uses robust iteration with outlier rejection (MAD filtering) +* Falls back to NaNs when fits are ill-conditioned or predictors are skipped + +## Example + +```python +from groupby_regression import GroupByRegressor + +df_out, dfGB = GroupByRegressor.make_parallel_fit( + df, + gb_columns=['detector_sector'], + fit_columns=['dEdx'], + linear_columns=['path_length', 'momentum'], + median_columns=['path_length'], + weights='w_dedx', + suffix='_calib', + selection=(df['track_quality'] > 0.9), + cast_dtype='float32', + addPrediction=True, + min_stat=[20, 20], + n_jobs=4 +) +``` + +## Output Columns (in `dfGB`): + +| Column Name | Description | +| ----------------------------------------------- | ---------------------------------------- | +| `<target>_slope_<predictor><suffix>` | Regression slope for predictor | +| `<target>_intercept<suffix>` | Regression intercept | +| `<target>_rms<suffix>` / `<target>_mad<suffix>` | Residual stats (robust only) | +| `<median_column><suffix>` | Median of the specified column per group | +| `bin_count<suffix>` | Number of entries in each group | + +## Regression Flowchart + +```text ++-------------+ +| Input Data | ++------+------+ + | + v ++------+------+ +| Apply mask | +| (selection)| ++------+------+ + | + v ++----------------------------+ +| Group by gb_columns | ++----------------------------+ + | + v ++----------------------------+ +| For each group: | +| - Check min_stat | +| - Fit model | +| - Estimate residual stats | ++----------------------------+ + | + v ++-------------+ +-------------+ +| df_out | | dfGB | +| (with preds)| | (fit params)| ++-------------+ +-------------+ +``` + +## Use Cases + +* Detector distortion correction +* dE/dx signal calibration +* Grouped trend removal in sensor data +* Statistical correction of multi-source measurements + +## Test Coverage + +* Basic regression fit and prediction verification +* Edge case handling (missing data, small groups) +* Outlier injection and robust fit evaluation +* Exact recovery of known coefficients +* `cast_dtype` precision testing + +## Performance & Benchmarking + +### Overview + +To evaluate scaling and performance trade-offs, a dedicated benchmark tool is provided: + +```bash +python3
bench_groupby_regression.py \ + --rows-per-group 5 --groups 5000 \ + --n-jobs 10 --sigmaCut 5 --fitter ols \ + --out bench_out --emit-csv +``` + +Each run generates: + +* `benchmark_report.txt` – human-readable summary +* `benchmark_results.json` / `.csv` – structured outputs for analysis + +### Example Results (25k rows / 5k groups ≈ 5 rows/group) + +**Command** + +```bash +python3 bench_groupby_regression.py \ + --rows-per-group 5 --groups 5000 \ + --n-jobs 10 --sigmaCut 5 --fitter ols \ + --out bench_out --emit-csv +``` + +**Laptop (Mac):** + +| Scenario | Config | Result (s / 1k groups) | +| ------------------------------- | ------------------------- | ---------------------- | +| Clean Serial | n_jobs=1, sigmaCut=5, OLS | **1.69** | +| Clean Parallel (10) | n_jobs=10 | **0.50** | +| 5% Outliers (3σ), Serial | n_jobs=1 | **1.68** | +| 10% Outliers (5σ), Serial | n_jobs=1 | **1.67** | +| **30% Outliers (5σ), Serial** | n_jobs=1 | **1.66** | +| **30% Outliers (5σ), Parallel** | n_jobs=10 | **0.30** | +| 10% Outliers (10σ), Serial | n_jobs=1 | **1.67** | + +**Server (Linux, Apptainer):** + +| Scenario | Config | Result (s / 1k groups) | +| --------------------------- | ------------------------- | ---------------------- | +| Clean Serial | n_jobs=1, sigmaCut=5, OLS | **4.14** | +| Clean Parallel (10) | n_jobs=10 | **0.98** | +| 5% Outliers (3σ), Serial | n_jobs=1 | **4.03** | +| 10% Outliers (5σ), Serial | n_jobs=1 | **4.01** | +| 10% Outliers (5σ), Parallel | n_jobs=10 | **0.65** | +| 10% Outliers (10σ), Serial | n_jobs=1 | **4.01** | + +*Dataset:* synthetic (y = 2·x₁ + 3·x₂ + ε) + +#### High Outlier Fraction (Stress Test) + +Even at **30% response outliers**, runtime remains essentially unchanged (no robust re-fit triggered by sigmaCut). +To emulate worst-case slowdowns seen on real data, a **leverage-outlier** mode (X-contamination) will be added in a follow-up. + +### Diagnostic Summary Utilities + +The regression framework can optionally emit per-group diagnostics when `diag=True` +is passed to `make_parallel_fit()`. + +Diagnostics include: + +| Field | Meaning | +|:------|:--------| +| `diag_time_ms` | Wall-time spent per group (ms) | +| `diag_n_refits` | Number of extra robust re-fits required | +| `diag_frac_rejected` | Fraction of rejected points after sigma-cut | +| `diag_cond_xtx` | Condition number proxy for design matrix | +| `diag_hat_max` | Maximum leverage in predictors | +| `diag_n_rows` | Number of rows in the group | + +Summaries can be generated directly: + +```python +summary = GroupByRegressor.summarize_diagnostics(dfGB, diag_prefix="diag_", diag_suffix="_fit") +print(GroupByRegressor.format_diagnostics_summary(summary)) +``` + +### Interpretation + +* The **OLS path** scales linearly with group count. +* **Parallelization** provides 4–5× acceleration for thousands of small groups. +* Current synthetic *y‑only* outliers do **not** trigger re‑fitting overhead. +* Real‑data slowdowns (up to 25×) occur when **sigmaCut** forces iterative robust refits (see the JSON-reading sketch below).
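+To compare scenarios from your own runs, the structured output written by `run_suite` can be summarized in a few lines (a sketch; the path assumes the default `--out bench_out`): + +```python +import json + +# Each record carries "scenario" plus "config"/"result" dicts, as written +# by run_suite() in bench_groupby_regression.py (df_params is stripped). +with open("bench_out/benchmark_results.json") as f: + results = json.load(f) +for r in results: + print(f"{r['scenario']:35s} {r['result']['sec_per_1k_groups']:6.2f} s per 1k groups") +``` +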
+ +### Recommendations + +| Use case | Suggested settings | +| ------------------------------ | ------------------------------------------------------------------------------------ | +| Clean data | `sigmaCut=100` (rejects essentially nothing; `np.inf` skips the re-fit pass entirely), use `n_jobs≈CPU cores` | +| Moderate outliers | `sigmaCut=5–10`, enable parallelization | +| Heavy outliers (detector data) | Use `fitter='robust'` or `huber` and accept higher cost | +| Quick validation | `bench_groupby_regression.py --quick` | + +### Cross-Platform Comparison (Mac vs Linux) + +Benchmark results on a Linux server (Apptainer, Python 3.11, joblib 1.4) show similar scaling but roughly **2–2.5 × longer wall-times** than on a MacBook (Pro/i7). +For the baseline case of 50 k rows / 10 k groups (~5 rows/group): + +| Scenario | Mac (s / 1 k groups) | Linux (s / 1 k groups) | Ratio (Linux / Mac) | +| --------------------------- | -------------------- | ---------------------- | ------------------- | +| Clean Serial | 1.75 | 3.98 | ≈ 2.3 × slower | +| Clean Parallel (10) | 0.41 | 0.78 | ≈ 1.9 × slower | +| 10 % Outliers (5 σ, Serial) | 1.77 | 4.01 | ≈ 2.3 × slower | + +Parallel efficiency on Linux (≈ 5 × speed-up from 1 → 10 jobs) matches the Mac results exactly. +The difference reflects platform-specific factors such as CPU frequency, BLAS implementation, and process-spawn overhead in Apptainer, not algorithmic changes. +Overall, **scaling behavior and outlier stability are identical across platforms.** + +### Future Work + +A future extension will introduce **leverage‑outlier** generation (outliers in X and Y) to replicate the observed 25× slowdown and allow comparative testing of different robust fitters. + +## Tips + +💡 Use `cast_dtype='float16'` for storage savings, but ensure it is compatible with downstream numerical precision requirements. + +### Usage Example for `cast_dtype` + +```python +import pandas as pd +import numpy as np +from dfextensions.groupby_regression import GroupByRegressor + +# Sample DataFrame +df = pd.DataFrame({ + 'group': ['A'] * 10 + ['B'] * 10, + 'x': np.linspace(0, 1, 20), + 'y': np.linspace(0, 2, 20) + np.random.normal(0, 0.1, 20), + 'weight': 1.0, +}) + +# Linear fit with casting to float32 +df_out, dfGB = GroupByRegressor.make_parallel_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x'], + median_columns=['x'], + weights='weight', + suffix='_f32', + selection=df['x'].notna(), + cast_dtype='float32', + addPrediction=True +) + +# Check resulting data types +print(dfGB.dtypes) +``` + +### Output (Example) + +``` +group object +x_f32 float64 +y_slope_x_f32 float32 +y_err_x_f32 float32 +y_intercept_f32 float32 +y_rms_f32 float32 +y_mad_f32 float32 +bin_count_f32 int64 +dtype: object +``` + +## Recent Changes + +* ✅ Unified `min_stat` interface for both OLS and robust fits +* ✅ Type casting via `cast_dtype` param (e.g.
`'float16'` for storage efficiency) +* ✅ Stable handling of singular matrices and small group sizes +* ✅ Test coverage for missing values, outliers, and exact recovery scenarios +* ✅ Logging replaces print-based diagnostics for cleaner integration diff --git a/UTILS/dfextensions/groupby_regression.py b/UTILS/dfextensions/groupby_regression.py new file mode 100644 index 000000000..cd4d10c3f --- /dev/null +++ b/UTILS/dfextensions/groupby_regression.py @@ -0,0 +1,672 @@ +import numpy as np +import pandas as pd +import logging +from sklearn.linear_model import LinearRegression, HuberRegressor +from joblib import Parallel, delayed +from numpy.linalg import inv, LinAlgError +from typing import Union, List, Tuple, Callable +from random import shuffle + +class GroupByRegressor: + @staticmethod + def _cast_fit_columns(dfGB: pd.DataFrame, cast_dtype: Union[str, None] = None) -> pd.DataFrame: + if cast_dtype is not None: + for col in dfGB.columns: + if ("slope" in col or "intercept" in col or "rms" in col or "mad" in col): + dfGB[col] = dfGB[col].astype(cast_dtype) + return dfGB + + @staticmethod + def make_linear_fit( + df: pd.DataFrame, + gb_columns: List[str], + fit_columns: List[str], + linear_columns: List[str], + median_columns: List[str], + suffix: str, + selection: pd.Series, + addPrediction: bool = False, + cast_dtype: Union[str, None] = None, + min_stat: int = 10 + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Perform grouped ordinary least squares linear regression and compute medians. + + Parameters: + df (pd.DataFrame): Input dataframe. + gb_columns (List[str]): Columns to group by. + fit_columns (List[str]): Target columns for regression. + linear_columns (List[str]): Predictor columns. + median_columns (List[str]): Columns to compute median. + suffix (str): Suffix for output columns. + selection (pd.Series): Boolean mask to filter rows. + addPrediction (bool): If True, add predicted values to df. + cast_dtype (str|None): Data type to cast result coefficients. + min_stat (int): Minimum number of rows per group to perform regression. 
+ + Returns: + Tuple[pd.DataFrame, pd.DataFrame]: (df with predictions, group-level regression results) + """ + df_selected = df.loc[selection] + group_results = [] + group_sizes = {} + groupby_key = gb_columns[0] if isinstance(gb_columns, (list, tuple)) and len(gb_columns) == 1 else gb_columns + + for key_vals, df_group in df_selected.groupby(groupby_key): + # Normalize group key to a tuple for consistent downstream usage + if isinstance(groupby_key, (list, tuple)): # multi-key groupby + key_tuple = key_vals # already a tuple + group_dict = dict(zip(gb_columns, key_vals)) + else: # single-key groupby + key_tuple = (key_vals,) # make it a tuple + group_dict = {gb_columns[0]: key_vals} + + # use the normalized tuple as the dict key to avoid surprises + group_sizes[key_tuple] = len(df_group) + + for target_col in fit_columns: + try: + X = df_group[linear_columns].values + y = df_group[target_col].values + if len(X) < min_stat: + for col in linear_columns: + group_dict[f"{target_col}_slope_{col}"] = np.nan + group_dict[f"{target_col}_intercept"] = np.nan + continue + model = LinearRegression() + model.fit(X, y) + for i, col in enumerate(linear_columns): + group_dict[f"{target_col}_slope_{col}"] = model.coef_[i] + group_dict[f"{target_col}_intercept"] = model.intercept_ + except Exception as e: + logging.warning(f"Linear regression failed for {target_col} in group {key_tuple}: {e}") + for col in linear_columns: + group_dict[f"{target_col}_slope_{col}"] = np.nan + group_dict[f"{target_col}_intercept"] = np.nan + + for col in median_columns: + group_dict[col] = df_group[col].median() + + group_results.append(group_dict) + + dfGB = pd.DataFrame(group_results) + dfGB = GroupByRegressor._cast_fit_columns(dfGB, cast_dtype) + + bin_counts = np.array([group_sizes.get(tuple(row), 0) for row in dfGB[gb_columns].itertuples(index=False)], dtype=np.int32) + dfGB["bin_count"] = bin_counts + dfGB = dfGB.rename(columns={col: f"{col}{suffix}" for col in dfGB.columns if col not in gb_columns}) + + if addPrediction: + df = df.merge(dfGB, on=gb_columns, how="left") + for target_col in fit_columns: + intercept_col = f"{target_col}_intercept{suffix}" + if intercept_col not in df.columns: + continue + df[f"{target_col}{suffix}"] = df[intercept_col] + for col in linear_columns: + slope_col = f"{target_col}_slope_{col}{suffix}" + if slope_col in df.columns: + df[f"{target_col}{suffix}"] += df[slope_col] * df[col] + + return df, dfGB + + @staticmethod + def process_group_robustBackup( + key: tuple, + df_group: pd.DataFrame, + gb_columns: List[str], + fit_columns: List[str], + linear_columns0: List[str], + median_columns: List[str], + weights: str, + minStat: List[int], + sigmaCut: float = 4, + fitter: Union[str, Callable] = "auto" + ) -> dict: + # TODO: handle the case of a single gb column + group_dict = dict(zip(gb_columns, key)) + predictors = [] + if isinstance(weights, str) and weights not in df_group.columns: + raise ValueError(f"Weight column '{weights}' not found in input DataFrame.") + + for i, col in enumerate(linear_columns0): + required_columns = [col] + fit_columns + [weights] + df_valid = df_group[required_columns].dropna() + if len(df_valid) >= minStat[i]: + predictors.append(col) + + for target_col in fit_columns: + try: + if not predictors: + continue + + subset_columns = predictors + [target_col, weights] + df_clean = df_group.dropna(subset=subset_columns) + + if len(df_clean) < min(minStat): + continue + + X = df_clean[predictors].values + y = df_clean[target_col].values + w = 
df_clean[weights].values + + model = None + if callable(fitter): + model = fitter() + elif fitter == "robust": + model = HuberRegressor(tol=1e-4) + elif fitter == "ols": + model = LinearRegression() + else: + model = HuberRegressor(tol=1e-4) + + try: + model.fit(X, y, sample_weight=w) + except Exception as e: + logging.warning(f"{model.__class__.__name__} failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.") + model = LinearRegression() + model.fit(X, y, sample_weight=w) + + predicted = model.predict(X) + residuals = y - predicted + n, p = X.shape + denom = n - p if n > p else 1e-9 + s2 = np.sum(residuals ** 2) / denom + + try: + cov_matrix = inv(X.T @ X) * s2 + std_errors = np.sqrt(np.diag(cov_matrix)) + except LinAlgError: + std_errors = np.full(len(predictors), np.nan) + + rms = np.sqrt(np.mean(residuals ** 2)) + mad = np.median(np.abs(residuals)) + + mask = np.abs(residuals) <= sigmaCut * mad + if mask.sum() >= min(minStat): + try: + model.fit(X[mask], y[mask], sample_weight=w[mask]) + except Exception as e: + logging.warning(f"{model.__class__.__name__} re-fit with outlier mask failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.") + model = LinearRegression() + model.fit(X[mask], y[mask], sample_weight=w[mask]) + + predicted = model.predict(X) + residuals = y - predicted + rms = np.sqrt(np.mean(residuals ** 2)) + mad = np.median(np.abs(residuals)) + + for col in linear_columns0: + if col in predictors: + idx = predictors.index(col) + group_dict[f"{target_col}_slope_{col}"] = model.coef_[idx] + group_dict[f"{target_col}_err_{col}"] = std_errors[idx] if idx < len(std_errors) else np.nan + else: + group_dict[f"{target_col}_slope_{col}"] = np.nan + group_dict[f"{target_col}_err_{col}"] = np.nan + + group_dict[f"{target_col}_intercept"] = model.intercept_ + group_dict[f"{target_col}_rms"] = rms + group_dict[f"{target_col}_mad"] = mad + except Exception as e: + logging.warning(f"Robust regression failed for {target_col} in group {key}: {e}") + for col in linear_columns0: + group_dict[f"{target_col}_slope_{col}"] = np.nan + group_dict[f"{target_col}_err_{col}"] = np.nan + group_dict[f"{target_col}_intercept"] = np.nan + group_dict[f"{target_col}_rms"] = np.nan + group_dict[f"{target_col}_mad"] = np.nan + + for col in median_columns: + group_dict[col] = df_group[col].median() + + return group_dict + + + @staticmethod + def process_group_robust( + key: tuple, + df_group: pd.DataFrame, + gb_columns: List[str], + fit_columns: List[str], + linear_columns0: List[str], + median_columns: List[str], + weights: str, + minStat: List[int], + sigmaCut: float = 4, + fitter: Union[str, Callable] = "auto", + # --- NEW (optional) diagnostics --- + diag: bool = False, + diag_prefix: str = "diag_", + ) -> dict: + """ + Per-group robust/OLS fit with optional diagnostics. + + Diagnostics (only when diag=True; added once per group into the result dict): + - {diag_prefix}n_refits : int, number of extra fits after the initial one (0 or 1 in this implementation) + - {diag_prefix}frac_rejected : float, fraction rejected by sigmaCut at final mask + - {diag_prefix}hat_max : float, max leverage proxy via QR (max rowwise ||Q||^2) + - {diag_prefix}cond_xtx : float, condition number of X^T X + - {diag_prefix}time_ms : float, wall-time per group (ms) excluding leverage/cond computation + - {diag_prefix}n_rows : int, number of rows in the group (after dropna for predictors/target/weights) + + Notes: + - n_refits counts *additional* iterations beyond the first fit. 
With this one-pass sigmaCut scheme, + it will be 0 (no re-fit) or 1 (re-fit once on inliers). + """ + import time + import numpy as np + import logging + from sklearn.linear_model import HuberRegressor, LinearRegression + + # TODO handle the case of single gb column + group_dict = dict(zip(gb_columns, key)) + + if isinstance(weights, str) and weights not in df_group.columns: + raise ValueError(f"Weight column '{weights}' not found in input DataFrame.") + + # Select predictors that meet per-predictor minStat (based on non-null rows with target+weights) + predictors: List[str] = [] + for i, col in enumerate(linear_columns0): + required_columns = [col] + fit_columns + [weights] + df_valid = df_group[required_columns].dropna() + if len(df_valid) >= minStat[i]: + predictors.append(col) + + # Prepare diagnostics state (group-level) + n_refits_group = 0 # extra fits after initial fit + frac_rejected_group = np.nan + hat_max_group = np.nan + cond_xtx_group = np.nan + time_ms_group = np.nan + n_rows_group = int(len(df_group)) # raw group size (will refine to cleaned size later) + + # Start timing the *fitting* work (we will stop before leverage/cond to avoid polluting time) + t0_group = time.perf_counter() + + # Loop over target columns + for target_col in fit_columns: + try: + if not predictors: + # No valid predictors met minStat; emit NaNs for this target + for col in linear_columns0: + group_dict[f"{target_col}_slope_{col}"] = np.nan + group_dict[f"{target_col}_err_{col}"] = np.nan + group_dict[f"{target_col}_intercept"] = np.nan + group_dict[f"{target_col}_rms"] = np.nan + group_dict[f"{target_col}_mad"] = np.nan + continue + + subset_columns = predictors + [target_col, weights] + df_clean = df_group.dropna(subset=subset_columns) + if len(df_clean) < min(minStat): + # Not enough rows to fit + for col in linear_columns0: + group_dict[f"{target_col}_slope_{col}"] = np.nan + group_dict[f"{target_col}_err_{col}"] = np.nan + group_dict[f"{target_col}_intercept"] = np.nan + group_dict[f"{target_col}_rms"] = np.nan + group_dict[f"{target_col}_mad"] = np.nan + continue + + # Update cleaned group size for diagnostics + n_rows_group = int(len(df_clean)) + + X = df_clean[predictors].to_numpy(copy=False) + y = df_clean[target_col].to_numpy(copy=False) + w = df_clean[weights].to_numpy(copy=False) + + # Choose model + if callable(fitter): + model = fitter() + elif fitter == "robust": + model = HuberRegressor(tol=1e-4) + elif fitter == "ols": + model = LinearRegression() + else: + model = HuberRegressor(tol=1e-4) + + # Initial fit + try: + model.fit(X, y, sample_weight=w) + except Exception as e: + logging.warning( + f"{model.__class__.__name__} failed for {target_col} in group {key}: {e}. " + f"Falling back to LinearRegression." 
+ ) + model = LinearRegression() + model.fit(X, y, sample_weight=w) + + # Residuals and robust stats + predicted = model.predict(X) + residuals = y - predicted + rms = float(np.sqrt(np.mean(residuals ** 2))) + mad = float(np.median(np.abs(residuals))) + + # One-pass sigmaCut masking (current implementation supports at most a single re-fit) + final_mask = None + if np.isfinite(mad) and mad > 0 and sigmaCut is not None and sigmaCut < np.inf: + mask = (np.abs(residuals) <= sigmaCut * mad) + if mask.sum() >= min(minStat): + # Re-fit on inliers + n_refits_group += 1 # <-- counts *extra* fits beyond the first + try: + model.fit(X[mask], y[mask], sample_weight=w[mask]) + except Exception as e: + logging.warning( + f"{model.__class__.__name__} re-fit with outlier mask failed for {target_col} " + f"in group {key}: {e}. Falling back to LinearRegression." + ) + model = LinearRegression() + model.fit(X[mask], y[mask], sample_weight=w[mask]) + + # Recompute residuals on full X (to report global rms/mad) + predicted = model.predict(X) + residuals = y - predicted + rms = float(np.sqrt(np.mean(residuals ** 2))) + mad = float(np.median(np.abs(residuals))) + final_mask = mask + else: + final_mask = np.ones_like(residuals, dtype=bool) + else: + final_mask = np.ones_like(residuals, dtype=bool) + + # Parameter errors from final fit (on the design actually used to fit) + try: + if final_mask is not None and final_mask.any(): + X_used = X[final_mask] + y_used = y[final_mask] + else: + X_used = X + y_used = y + + n, p = X_used.shape + denom = n - p if n > p else 1e-9 + s2 = float(np.sum((y_used - model.predict(X_used)) ** 2) / denom) + cov_matrix = np.linalg.inv(X_used.T @ X_used) * s2 + std_errors = np.sqrt(np.diag(cov_matrix)) + except np.linalg.LinAlgError: + std_errors = np.full(len(predictors), np.nan, dtype=float) + + # Store results for this target + for col in linear_columns0: + if col in predictors: + idx = predictors.index(col) + group_dict[f"{target_col}_slope_{col}"] = float(model.coef_[idx]) + group_dict[f"{target_col}_err_{col}"] = float(std_errors[idx]) if idx < len(std_errors) else np.nan + else: + group_dict[f"{target_col}_slope_{col}"] = np.nan + group_dict[f"{target_col}_err_{col}"] = np.nan + + group_dict[f"{target_col}_intercept"] = float(model.intercept_) if hasattr(model, "intercept_") else np.nan + group_dict[f"{target_col}_rms"] = rms + group_dict[f"{target_col}_mad"] = mad + + # Update group-level diagnostics that depend on the final mask + if diag: + # Capture timing up to here (pure fitting + residuals + errors); exclude leverage/cond below + time_ms_group = (time.perf_counter() - t0_group) * 1e3 + if final_mask is not None and len(final_mask) > 0: + frac_rejected_group = 1.0 - (float(np.count_nonzero(final_mask)) / float(len(final_mask))) + else: + frac_rejected_group = np.nan + + except Exception as e: + logging.warning(f"Robust regression failed for {target_col} in group {key}: {e}") + for col in linear_columns0: + group_dict[f"{target_col}_slope_{col}"] = np.nan + group_dict[f"{target_col}_err_{col}"] = np.nan + group_dict[f"{target_col}_intercept"] = np.nan + group_dict[f"{target_col}_rms"] = np.nan + group_dict[f"{target_col}_mad"] = np.nan + + # Medians + for col in median_columns: + try: + group_dict[col] = df_group[col].median() + except Exception: + group_dict[col] = np.nan + + # Compute leverage & conditioning proxies (kept OUTSIDE the timed span) + if diag: + try: + X_cols = [c for c in linear_columns0 if c in df_group.columns and c in predictors] + if X_cols: + X_diag = 
df_group[X_cols].dropna().to_numpy(dtype=np.float64, copy=False) + else: + X_diag = None + + hat_max_group = np.nan + cond_xtx_group = np.nan + if X_diag is not None and X_diag.size and X_diag.shape[1] > 0: + # cond(X^T X) + try: + s = np.linalg.svd(X_diag.T @ X_diag, compute_uv=False) + cond_xtx_group = float(s[0] / s[-1]) if (s.size > 0 and s[-1] > 0) else float("inf") + except Exception: + cond_xtx_group = float("inf") + # leverage via QR + try: + Q, _ = np.linalg.qr(X_diag, mode="reduced") + hat_max_group = float(np.max(np.sum(Q * Q, axis=1))) + except Exception: + pass + except Exception: + pass + + # Attach diagnostics (once per group) + group_dict[f"{diag_prefix}n_refits"] = int(n_refits_group) + group_dict[f"{diag_prefix}frac_rejected"] = float(frac_rejected_group) if np.isfinite(frac_rejected_group) else np.nan + group_dict[f"{diag_prefix}hat_max"] = float(hat_max_group) if np.isfinite(hat_max_group) else np.nan + group_dict[f"{diag_prefix}cond_xtx"] = float(cond_xtx_group) if np.isfinite(cond_xtx_group) else np.nan + group_dict[f"{diag_prefix}time_ms"] = float(time_ms_group) if np.isfinite(time_ms_group) else np.nan + group_dict[f"{diag_prefix}n_rows"] = int(n_rows_group) + + return group_dict + + @staticmethod + def make_parallel_fit( + df: pd.DataFrame, + gb_columns: List[str], + fit_columns: List[str], + linear_columns: List[str], + median_columns: List[str], + weights: str, + suffix: str, + selection: pd.Series, + addPrediction: bool = False, + cast_dtype: Union[str, None] = None, + n_jobs: int = 1, + min_stat: List[int] = [10, 10], + sigmaCut: float = 4.0, + fitter: Union[str, Callable] = "auto", + batch_size: Union[int, None] = "auto", # ← new argument + # --- NEW: diagnostics switch --- + diag: bool = False, + diag_prefix: str = "diag_" + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Perform grouped robust linear regression using HuberRegressor in parallel. + + Parameters: + df (pd.DataFrame): Input dataframe. + gb_columns (List[str]): Columns to group by. + fit_columns (List[str]): Target columns for regression. + linear_columns (List[str]): Predictor columns. + median_columns (List[str]): Columns to compute medians. + weights (str): Column name of weights for fitting. + suffix (str): Suffix to append to output columns. + selection (pd.Series): Boolean selection mask. + addPrediction (bool): If True, add prediction columns to df. + cast_dtype (Union[str, None]): Optional dtype cast for fit outputs. + n_jobs (int): Number of parallel jobs. + min_stat (List[int]): Minimum number of rows required to use each predictor. + sigmaCut (float): Outlier threshold in MAD units. + + Returns: + Tuple[pd.DataFrame, pd.DataFrame]: DataFrame with predictions and group-level statistics. 
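+ + Notes: + fitter: "ols" selects LinearRegression; "robust" (or any other string, including the "auto" default) selects HuberRegressor; a callable is used as an estimator factory. + batch_size: forwarded to joblib.Parallel. + diag / diag_prefix: when diag=True, per-group diagnostic columns ({diag_prefix}time_ms, {diag_prefix}n_refits, {diag_prefix}frac_rejected, {diag_prefix}cond_xtx, {diag_prefix}hat_max, {diag_prefix}n_rows) are appended to the group-level output.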
+ """ + if isinstance(weights, str) and weights not in df.columns: + raise ValueError(f"Weight column '{weights}' not found in input DataFrame") + + df_selected = df.loc[selection] + grouped = df_selected.groupby(gb_columns) + + filtered_items = [(key, idxs) for key, idxs in grouped.groups.items() if len(idxs) >= min_stat[0]/2] + # shuffle(filtered_items) # Shuffle to ensure random order in parallel processing - should be an option + + results = Parallel(n_jobs=n_jobs,batch_size=batch_size)( + delayed(GroupByRegressor.process_group_robust)( + key, df_selected.loc[idxs], gb_columns, fit_columns, linear_columns, + median_columns, weights, min_stat, sigmaCut, fitter, + diag=diag, # <-- pass through + diag_prefix=diag_prefix, # <-- pass through + ) + for key, idxs in filtered_items + ) + + dfGB = pd.DataFrame(results) + dfGB = GroupByRegressor._cast_fit_columns(dfGB, cast_dtype) + + bin_counts = np.array([ + len(grouped.get_group(key)) if key in grouped.groups else 0 + for key in dfGB[gb_columns].itertuples(index=False, name=None) + ], dtype=np.int32) + dfGB["bin_count"] = bin_counts + dfGB = dfGB.rename(columns={col: f"{col}{suffix}" for col in dfGB.columns if col not in gb_columns}) + + if addPrediction: + df = df.merge(dfGB, on=gb_columns, how="left") + for target_col in fit_columns: + intercept_col = f"{target_col}_intercept{suffix}" + if intercept_col not in df.columns: + continue + df[f"{target_col}{suffix}"] = df[intercept_col] + for col in linear_columns: + slope_col = f"{target_col}_slope_{col}{suffix}" + if slope_col in df.columns: + df[f"{target_col}{suffix}"] += df[slope_col] * df[col] + + return df, dfGB + + + + def summarize_diagnostics_top(dfGB, diag_prefix: str = "diag_", suffix="", top: int = 10): + """ + Quick look at diagnostic columns emitted by make_parallel_fit(..., diag=True). + Returns a dict of small DataFrames for top offenders, and prints a short summary. 
+ + Example: + summ = summarize_diagnostics(dfGB, top=20) + summ["slowest"].head() + """ + import pandas as pd + cols = { + "time": f"{diag_prefix}time_ms{suffix}", + "refits": f"{diag_prefix}n_refits{suffix}", + "rej": f"{diag_prefix}frac_rejected{suffix}", + "lev": f"{diag_prefix}hat_max{suffix}", + "cond": f"{diag_prefix}cond_xtx{suffix}", + "nrows": f"{diag_prefix}n_rows{suffix}", + } + missing = [c for c in cols.values() if c not in dfGB.columns] + if missing: + print("[diagnostics] Missing columns (did you run diag=True?):", missing) + return {} + + summary = {} + # Defensive: numeric coerce + d = dfGB.copy() + for k, c in cols.items(): + d[c] = pd.to_numeric(d[c], errors="coerce") + + summary["slowest"] = d.sort_values(cols["time"], ascending=False).head(top)[list({*dfGB.columns[:len(dfGB.columns)//4], *cols.values()})] + summary["most_refits"] = d.sort_values(cols["refits"], ascending=False).head(top)[list({*dfGB.columns[:len(dfGB.columns)//4], *cols.values()})] + summary["most_rejected"] = d.sort_values(cols["rej"], ascending=False).head(top)[list({*dfGB.columns[:len(dfGB.columns)//4], *cols.values()})] + summary["highest_leverage"] = d.sort_values(cols["lev"], ascending=False).head(top)[list({*dfGB.columns[:len(dfGB.columns)//4], *cols.values()})] + summary["worst_conditioned"] = d.sort_values(cols["cond"], ascending=False).head(top)[list({*dfGB.columns[:len(dfGB.columns)//4], *cols.values()})] + + # Console summary + print("[diagnostics] Groups:", len(dfGB)) + print("[diagnostics] mean time (ms):", float(d[cols["time"]].mean())) + print("[diagnostics] pct with refits>0:", float((d[cols["refits"]] > 0).mean()) * 100.0) + print("[diagnostics] mean frac_rejected:", float(d[cols["rej"]].mean())) + print("[diagnostics] 99p cond_xtx:", float(d[cols["cond"]].quantile(0.99))) + print("[diagnostics] 99p hat_max:", float(d[cols["lev"]].quantile(0.99))) + return summary + + @staticmethod + def summarize_diagnostics( + dfGB: "pd.DataFrame", + diag_prefix: str = "diag_", + diag_suffix: str = "", + quantiles: tuple[float, ...] = (0.50, 0.90, 0.95, 0.99), + ) -> dict: + """ + Aggregate per-group diagnostics emitted by make_parallel_fit(..., diag=True). 
+ Returns a plain dict with mean/median/std and selected quantiles for: + - time_ms, frac_rejected, n_refits, cond_xtx, hat_max, n_rows + """ + def _col(base: str): + exact = f"{diag_prefix}{base}" + if exact in dfGB.columns: + return exact + # tolerate suffixing like diag_time_ms_fit + pref = f"{diag_prefix}{base}{diag_suffix}" + for c in dfGB.columns: + if isinstance(c, str) and c.startswith(pref): + return c + return None + + cols = { + "time_ms": _col("time_ms"), + "frac_rejected": _col("frac_rejected"), + "n_refits": _col("n_refits"), + "cond_xtx": _col("cond_xtx"), + "hat_max": _col("hat_max"), + "n_rows": _col("n_rows"), + } + + out: dict = {"groups": int(len(dfGB)), "diag_prefix": diag_prefix} + for name, col in cols.items(): + if not col or col not in dfGB.columns: + continue + s = pd.to_numeric(dfGB[col], errors="coerce") + if name == "cond_xtx": + s = s.replace([np.inf, -np.inf], np.nan) + s = s.dropna() + if s.empty: + continue + out[f"{name}_mean"] = float(s.mean()) + out[f"{name}_median"] = float(s.median()) + out[f"{name}_std"] = float(s.std(ddof=1)) if len(s) > 1 else 0.0 + for q in quantiles: + out[f"{name}_p{int(q*100)}"] = float(s.quantile(q)) + if name == "n_refits": + out["pct_refits_gt0"] = float((s > 0).mean() * 100.0) + return out + + + @staticmethod + def format_diagnostics_summary(summary: dict) -> str: + """ + Pretty, single-paragraph human summary from summarize_diagnostics(..) output. + Safe to print or append to reports. + """ + if not summary or "groups" not in summary: + return "Diagnostics: no data." + def g(k, default="nan"): + v = summary.get(k, None) + return f"{v:.3f}" if isinstance(v, (int, float)) else default + + lines = [] + lines.append( + f"Diagnostics over {summary['groups']} groups — " + f"time_ms p50/p95/p99={g('time_ms_p50')}/{g('time_ms_p95')}/{g('time_ms_p99')}, " + f"mean={g('time_ms_mean')}, std={g('time_ms_std')}; " + f"frac_rejected mean={g('frac_rejected_mean')}, p95={g('frac_rejected_p95')}, p99={g('frac_rejected_p99')}; " + f"refits>0={g('pct_refits_gt0')}% ; " + f"cond_xtx p99={g('cond_xtx_p99')}, hat_max p99={g('hat_max_p99')}." 
+ ) + return lines[0] + diff --git a/UTILS/dfextensions/quantile_fit_nd/.gitignore b/UTILS/dfextensions/quantile_fit_nd/.gitignore new file mode 100644 index 000000000..46c1247fe --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/.gitignore @@ -0,0 +1,5 @@ +.idea/ +.vscode/ +__pycache__/ +*.py[cod] +.DS_Store diff --git a/UTILS/dfextensions/quantile_fit_nd/__init__.py b/UTILS/dfextensions/quantile_fit_nd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/UTILS/dfextensions/quantile_fit_nd/bench.log b/UTILS/dfextensions/quantile_fit_nd/bench.log new file mode 100644 index 000000000..93e826fcc --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/bench.log @@ -0,0 +1,27 @@ +Distributions: uniform, poisson, gaussian (Poisson uses PIT, λ=50.0) +q_step=0.025, dq=0.05, z_bins=20, sample_fraction=0.006 + +=== Benchmark: uniform === +N= 2000 | t_fit=0.049s | rms_b=2.41368 (rms_b*√N=107.94303) | σQ_rel=0.146 | rt_rms=0.01513 (rt_rms*√N=0.67650) | z_inf=20 | mem=nanMB +N= 5000 | t_fit=0.056s | rms_b=0.55985 (rms_b*√N=39.58722) | σQ_rel=0.066 | rt_rms=0.01024 (rt_rms*√N=0.72411) | z_inf=20 | mem=nanMB +N= 10000 | t_fit=0.059s | rms_b=0.32056 (rms_b*√N=32.05619) | σQ_rel=0.068 | rt_rms=0.01047 (rt_rms*√N=1.04684) | z_inf=20 | mem=nanMB +N= 20000 | t_fit=0.064s | rms_b=0.25235 (rms_b*√N=35.68710) | σQ_rel=0.053 | rt_rms=0.00986 (rt_rms*√N=1.39477) | z_inf=20 | mem=nanMB +N= 50000 | t_fit=0.080s | rms_b=0.15938 (rms_b*√N=35.63915) | σQ_rel=0.060 | rt_rms=0.01008 (rt_rms*√N=2.25341) | z_inf=20 | mem=nanMB +=== Benchmark: poisson === +N= 2000 | t_fit=0.048s | rms_b=1.08724 (rms_b*√N=48.62280) | σQ_rel=0.099 | rt_rms=0.01480 (rt_rms*√N=0.66168) | z_inf=19 | mem=nanMB +N= 5000 | t_fit=0.056s | rms_b=0.42197 (rms_b*√N=29.83781) | σQ_rel=0.085 | rt_rms=0.01021 (rt_rms*√N=0.72211) | z_inf=20 | mem=nanMB +N= 10000 | t_fit=0.059s | rms_b=0.30544 (rms_b*√N=30.54434) | σQ_rel=0.074 | rt_rms=0.01037 (rt_rms*√N=1.03662) | z_inf=20 | mem=nanMB +N= 20000 | t_fit=0.065s | rms_b=0.27920 (rms_b*√N=39.48468) | σQ_rel=0.057 | rt_rms=0.00977 (rt_rms*√N=1.38106) | z_inf=20 | mem=nanMB +N= 50000 | t_fit=0.083s | rms_b=0.16595 (rms_b*√N=37.10819) | σQ_rel=0.064 | rt_rms=0.00996 (rt_rms*√N=2.22747) | z_inf=20 | mem=nanMB +=== Benchmark: gaussian === +N= 2000 | t_fit=0.048s | rms_b=3.60489 (rms_b*√N=161.21578) | σQ_rel=0.137 | rt_rms=0.00984 (rt_rms*√N=0.43992) | z_inf=20 | mem=nanMB +N= 5000 | t_fit=0.056s | rms_b=0.57066 (rms_b*√N=40.35166) | σQ_rel=0.073 | rt_rms=0.00845 (rt_rms*√N=0.59739) | z_inf=20 | mem=nanMB +N= 10000 | t_fit=0.059s | rms_b=0.36336 (rms_b*√N=36.33622) | σQ_rel=0.046 | rt_rms=0.00971 (rt_rms*√N=0.97096) | z_inf=20 | mem=nanMB +N= 20000 | t_fit=0.065s | rms_b=0.31920 (rms_b*√N=45.14134) | σQ_rel=0.071 | rt_rms=0.00945 (rt_rms*√N=1.33642) | z_inf=20 | mem=nanMB +N= 50000 | t_fit=0.081s | rms_b=0.13731 (rms_b*√N=30.70329) | σQ_rel=0.055 | rt_rms=0.01000 (rt_rms*√N=2.23535) | z_inf=20 | mem=nanMB + +=== Scaling summary (expect: α_b ≈ -0.5, α_rt ≈ 0.0) === +uniform | α_b=-0.802 (→ -0.5) | α_rt=-0.111 (→ 0.0) | mean(rms_b√N)=50.18254 +poisson | α_b=-0.539 (→ -0.5) | α_rt=-0.109 (→ 0.0) | mean(rms_b√N)=37.11956 +gaussian | α_b=-0.922 (→ -0.5) | α_rt= 0.017 (→ 0.0) | mean(rms_b√N)=62.74966 +Saved PNG plots: bench_scaling_uniform.png, bench_scaling_poisson.png, bench_scaling_gaussian.png diff --git a/UTILS/dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py b/UTILS/dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py new file mode 100644 index 000000000..45f96f539 --- /dev/null +++ 
b/UTILS/dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py @@ -0,0 +1,336 @@ +#!/usr/bin/env python3 +# dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py +""" +Benchmark speed + precision for fit_quantile_linear_nd with scaling checks. + +- Distributions: uniform, poisson (via randomized PIT), gaussian +- q_centers step = 0.025; dq = 0.05 (more points per z-bin) +- Precision metrics per N: + * rms_b := sqrt(mean( (b_meas(z) - b_exp(z))^2 )) over informative z-bins + * rel_err_sigmaQ := median relative error of sigma_Q vs truth per z-bin + * rms_rt := round-trip inversion RMS for a random subset of events +- Scaling check: + * expect alpha_b ≈ -0.5 (rms_b ∝ N^{-1/2}) + * expect alpha_rt ≈ 0.0 (rms_rt roughly flat; per-event noise) +- Prints E*sqrt(N) for rms_b as a constancy sanity check. +- Optional: PNG plots (log-log), CSV export, memory profiling, strict assertions. +""" + +import argparse +import json +import warnings +from math import erf, sqrt +import time +import numpy as np +import pandas as pd + +from dfextensions.quantile_fit_nd.quantile_fit_nd import ( + fit_quantile_linear_nd, + QuantileEvaluator, +) +from dfextensions.quantile_fit_nd.utils import discrete_to_uniform_rank_poisson + +RNG = np.random.default_rng(123456) + + +# ----------------------- Synthetic data generation ----------------------- + +def gen_Q_from_distribution(dist: str, n: int, *, lam: float) -> np.ndarray: + if dist == "uniform": + return RNG.uniform(0.0, 1.0, size=n) + elif dist == "poisson": + k = RNG.poisson(lam, size=n) + return discrete_to_uniform_rank_poisson(k, lam, mode="randomized", rng=RNG) + elif dist == "gaussian": + z = RNG.normal(0.0, 1.0, size=n) # standard normal + cdf = 0.5 * (1.0 + np.array([erf(zi / np.sqrt(2)) for zi in z])) + return np.clip(cdf, 0.0, 1.0) + else: + raise ValueError(f"unknown dist {dist}") + + +def gen_synthetic_df( + n: int, + dist: str, + *, + lam: float, + z_sigma_cm: float = 5.0, + z_range_cm: float = 10.0, + sigma_X_given_Q: float = 0.5, + a0: float = 10.0, + a1: float = 0.5, + b0: float = 50.0, + b1: float = 2.0, +) -> tuple[pd.DataFrame, dict]: + Q = gen_Q_from_distribution(dist, n, lam=lam) + z = np.clip(RNG.normal(0.0, z_sigma_cm, size=n), -z_range_cm, z_range_cm) + a_true = a0 + a1 * z + b_true = (b0 + b1 * z / max(z_range_cm, 1e-6)).clip(min=5.0) + X = a_true + b_true * Q + RNG.normal(0.0, sigma_X_given_Q, size=n) + df = pd.DataFrame({ + "channel_id": np.repeat("ch0", n), + "Q": Q, + "X": X, + "z_vtx": z, + "is_outlier": np.zeros(n, dtype=bool), + }) + truth = { + "a0": a0, "a1": a1, + "b0": b0, "b1": b1, + "sigma_X_given_Q": sigma_X_given_Q, + "z_range": z_range_cm, + "lam": lam + } + return df, truth + + +# ----------------------- Helpers for expectations ----------------------- + +def _edges_from_centers(centers: np.ndarray) -> np.ndarray: + mid = 0.5 * (centers[1:] + centers[:-1]) + first = centers[0] - (mid[0] - centers[0]) + last = centers[-1] + (centers[-1] - mid[-1]) + return np.concatenate([[first], mid, [last]]) + + +def expected_b_per_zbin_from_sample(df: pd.DataFrame, z_edges: np.ndarray, truth: dict) -> np.ndarray: + z_vals = df["z_vtx"].to_numpy(np.float64) + b_true_all = (truth["b0"] + truth["b1"] * z_vals / max(truth["z_range"], 1e-6)).clip(min=5.0) + b_expected = [] + for i in range(len(z_edges) - 1): + m = (z_vals >= z_edges[i]) & (z_vals <= z_edges[i+1]) + b_expected.append(np.mean(b_true_all[m]) if m.sum() > 0 else np.nan) + return np.array(b_expected, dtype=np.float64) + + +def weights_from_fit_stats(col: pd.Series) -> np.ndarray: + 
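    """Per-row n_used weights parsed from each fit_stats JSON blob (NaN when absent or unparseable)."""
+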
w = [] + for s in col: + try: + d = json.loads(s) + except Exception: + d = {} + w.append(d.get("n_used", np.nan)) + return np.array(w, dtype=float) + + +# ----------------------------- Benchmark core ----------------------------- + +def run_one( + dist: str, + n: int, + *, + q_step=0.025, + dq=0.05, + z_bins=20, + lam=50.0, + sample_fraction=0.006, + mem_profile: bool = False, +) -> dict: + df, truth = gen_synthetic_df(n, dist, lam=lam) + q_centers = np.arange(0.0, 1.0 + 1e-12, q_step) # 0..1 inclusive + + def _do_fit(): + return fit_quantile_linear_nd( + df, + channel_key="channel_id", + q_centers=q_centers, + dq=dq, + nuisance_axes={"z": "z_vtx"}, + n_bins_axes={"z": z_bins}, + ) + + t0 = time.perf_counter() + if mem_profile: + try: + from memory_profiler import memory_usage + mem_trace, table = memory_usage((_do_fit, ), retval=True, max_iterations=1) + peak_mem_mb = float(np.max(mem_trace)) if len(mem_trace) else np.nan + except Exception as e: + warnings.warn(f"memory_profiler unavailable or failed: {e}") + table = _do_fit() + peak_mem_mb = np.nan + else: + table = _do_fit() + peak_mem_mb = np.nan + t_fit = time.perf_counter() - t0 + + # Expected b per z-bin (from sample) + z_centers = np.sort(table["z_center"].unique()) + z_edges = _edges_from_centers(z_centers) + b_exp = expected_b_per_zbin_from_sample(df, z_edges, truth) + + # Measured b per z-bin (weighted by window n_used) + b_meas_w = np.full_like(b_exp, np.nan, dtype=float) + for i, zc in enumerate(z_centers): + g = table[table["z_center"] == zc] + if g.empty: + continue + w = weights_from_fit_stats(g["fit_stats"]) + ok = np.isfinite(g["b"].to_numpy()) & (w > 0) + if ok.sum() == 0: + continue + bvals = g["b"].to_numpy()[ok] + ww = w[ok] + b_meas_w[i] = np.average(bvals, weights=ww) + + # Informative mask + m = np.isfinite(b_meas_w) & np.isfinite(b_exp) + + # Slope error metrics + rms_b = float(np.sqrt(np.nanmean((b_meas_w[m] - b_exp[m]) ** 2))) if m.any() else np.nan + + # sigma_Q check (median rel err by z-bin) + sigma_q_true = truth["sigma_X_given_Q"] / np.maximum(1e-9, b_exp) + sigma_q_meas = table.groupby("z_center")["sigma_Q"].median().reindex(z_centers).to_numpy() + mk = np.isfinite(sigma_q_true) & np.isfinite(sigma_q_meas) + rel_err_sigmaQ = float(np.nanmean(np.abs(sigma_q_meas[mk] - sigma_q_true[mk]) / + np.maximum(1e-9, sigma_q_true[mk]))) if mk.any() else np.nan + + # Round-trip inversion RMS (sample to limit CPU) + k = max(1, int(round(sample_fraction * len(df)))) + idx = RNG.choice(len(df), size=min(k, len(df)), replace=False) + evalr = QuantileEvaluator(table) + resid = [] + for ii in idx: + z = float(df.loc[ii, "z_vtx"]) + q_true = float(df.loc[ii, "Q"]) + x = float(df.loc[ii, "X"]) + q_hat = evalr.invert_rank(x, channel_id="ch0", z=z) + resid.append(q_hat - q_true) + resid = np.array(resid, dtype=float) + rms_rt = float(np.sqrt(np.mean(np.square(resid)))) if resid.size else np.nan + + return dict( + dist=dist, N=int(n), + lam=float(lam), + q_step=float(q_step), dq=float(dq), z_bins=int(z_bins), + t_fit=float(t_fit), + rms_b=rms_b, + rel_err_sigmaQ=rel_err_sigmaQ, + rms_rt=rms_rt, + n_z_inf=int(np.sum(m)), + peak_mem_mb=peak_mem_mb, + ) + + +def fit_log_slope(xs: np.ndarray, ys: np.ndarray) -> float: + # Fit log(ys) ~ alpha * log(xs) + c ; return alpha + m = np.isfinite(xs) & np.isfinite(ys) & (ys > 0) + if m.sum() < 2: + warnings.warn("Insufficient points for scaling slope fit.", RuntimeWarning) + return np.nan + lx = np.log(xs[m].astype(float)) + ly = np.log(ys[m].astype(float)) + A = np.column_stack([lx, 
np.ones_like(lx)]) + sol, _, _, _ = np.linalg.lstsq(A, ly, rcond=None) + return float(sol[0]) + + +def _plot_scaling(res: pd.DataFrame, dists: list[str]): + try: + import matplotlib.pyplot as plt + except Exception as e: + warnings.warn(f"--plot requested but matplotlib unavailable: {e}") + return + for dist in dists: + sub = res[res["dist"] == dist].sort_values("N") + if sub.empty: + continue + fig, ax = plt.subplots(figsize=(6.2, 4.2), dpi=140) + ax.loglog(sub["N"], sub["rms_b"], marker="o", label="rms_b") + ax.loglog(sub["N"], sub["rms_rt"], marker="s", label="rms_rt") + ax.set_title(f"Scaling — {dist}") + ax.set_xlabel("N events") + ax.set_ylabel("Error") + ax.grid(True, which="both", ls=":") + ax.legend() + fig.tight_layout() + fig.savefig(f"bench_scaling_{dist}.png") + plt.close(fig) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--Ns", type=str, default="2000,5000,10000,20000,50000", + help="comma-separated N list") + ap.add_argument("--dists", type=str, default="uniform,poisson,gaussian", + help="comma-separated distributions") + ap.add_argument("--lam", type=float, default=50.0, help="Poisson mean λ") + ap.add_argument("--q_step", type=float, default=0.025, help="q_center step") + ap.add_argument("--dq", type=float, default=0.05, help="Δq window") + ap.add_argument("--z_bins", type=int, default=20, help="# z bins") + ap.add_argument("--sample_fraction", type=float, default=0.006, help="fraction for round-trip sampling") + ap.add_argument("--plot", action="store_true", help="save log-log plots as PNGs") + ap.add_argument("--save_csv", type=str, default="", help="path to save CSV results") + ap.add_argument("--mem_profile", action="store_true", help="profile peak memory (if memory_profiler available)") + ap.add_argument("--strict", action="store_true", help="raise AssertionError on scaling deviations") + ap.add_argument("--scaling_tol", type=float, default=0.20, help="tolerance for |alpha_b + 0.5|") + ap.add_argument("--rt_tol", type=float, default=0.10, help="tolerance for |alpha_rt - 0.0|") + args = ap.parse_args() + + Ns = [int(s) for s in args.Ns.split(",") if s.strip()] + dists = [s.strip() for s in args.dists.split(",") if s.strip()] + + print(f"Distributions: {', '.join(dists)} (Poisson uses PIT, λ={args.lam})") + print(f"q_step={args.q_step}, dq={args.dq}, z_bins={args.z_bins}, sample_fraction={args.sample_fraction}\n") + + rows = [] + for dist in dists: + print(f"=== Benchmark: {dist} ===") + for N in Ns: + r = run_one( + dist, N, + q_step=args.q_step, dq=args.dq, z_bins=args.z_bins, + lam=args.lam, sample_fraction=args.sample_fraction, + mem_profile=args.mem_profile, + ) + rows.append(r) + print(f"N={N:7d} | t_fit={r['t_fit']:.3f}s | rms_b={r['rms_b']:.5f} " + f"(rms_b*√N={r['rms_b']*sqrt(N):.5f}) | σQ_rel={r['rel_err_sigmaQ']:.3f} | " + f"rt_rms={r['rms_rt']:.5f} (rt_rms*√N={r['rms_rt']*sqrt(N):.5f}) | " + f"z_inf={r['n_z_inf']} | mem={r['peak_mem_mb']:.1f}MB") + + res = pd.DataFrame(rows) + + # Scaling summary per distribution + print("\n=== Scaling summary (expect: α_b ≈ -0.5, α_rt ≈ 0.0) ===") + summary_rows = [] + for dist in dists: + sub = res[res["dist"] == dist].sort_values("N") + a_b = fit_log_slope(sub["N"].to_numpy(), sub["rms_b"].to_numpy()) + a_rt = fit_log_slope(sub["N"].to_numpy(), sub["rms_rt"].to_numpy()) + const_b = (sub["rms_b"] * np.sqrt(sub["N"])).to_numpy() + print(f"{dist:8s} | α_b={a_b: .3f} (→ -0.5) | α_rt={a_rt: .3f} (→ 0.0) " + f"| mean(rms_b√N)={np.nanmean(const_b):.5f}") + summary_rows.append({"dist": dist, "alpha_rms_b": 
a_b, "alpha_rms_rt": a_rt})
+    summary = pd.DataFrame(summary_rows)
+
+    # CSV export
+    if args.save_csv:
+        res.to_csv(args.save_csv, index=False)
+        print(f"\nSaved CSV to: {args.save_csv}")
+
+    # Plots
+    if args.plot:
+        _plot_scaling(res, dists)
+        print("Saved PNG plots:", ", ".join(f"bench_scaling_{d}.png" for d in dists))
+
+    # Checks (warn by default; --strict to raise)
+    for dist in dists:
+        a_b = float(summary[summary["dist"] == dist]["alpha_rms_b"].iloc[0])
+        a_rt = float(summary[summary["dist"] == dist]["alpha_rms_rt"].iloc[0])
+        ok_b = np.isfinite(a_b) and (abs(a_b + 0.5) <= args.scaling_tol)
+        ok_rt = np.isfinite(a_rt) and (abs(a_rt - 0.0) <= args.rt_tol)
+        msg = f"SCALING [{dist}] α_b={a_b:.3f} vs -0.5 (tol {args.scaling_tol}) | α_rt={a_rt:.3f} vs 0.0 (tol {args.rt_tol})"
+        if ok_b and ok_rt:
+            print("PASS - " + msg)
+        elif args.strict:
+            raise AssertionError("FAIL - " + msg)
+        else:
+            warnings.warn("WARN - " + msg)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/UTILS/dfextensions/quantile_fit_nd/contextLLM.md b/UTILS/dfextensions/quantile_fit_nd/contextLLM.md new file mode 100644 index 000000000..7b5e3a483 --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/contextLLM.md @@ -0,0 +1,99 @@
+# contextLLM.md — ND Quantile Linear Fit (quick context)
+
+## TL;DR
+
+We fit a **local linear inverse quantile model** per channel and nuisance grid:
+$$
+X(q,n) \approx a(q_0,n) + b(q_0,n)\,\underbrace{(q - q_0)}_{\Delta q},\quad b>0
+$$
+
+* Monotonic in **q** via $b > b_\text{min}$.
+* Smooth in nuisance axes (e.g., **z**, later **η**, **time**) via separable interpolation.
+* **Discrete inputs** (tracks/clusters/Poisson): convert to **continuous ranks** (PIT or mid-ranks) *before* fitting.
+
+## Key Files
+
+* `dfextensions/quantile_fit_nd/quantile_fit_nd.py` — core fitter + evaluator
+* `dfextensions/quantile_fit_nd/utils.py` — discrete→uniform helpers (PIT/mid-rank)
+* `dfextensions/quantile_fit_nd/test_quantile_fit_nd.py` — unit tests + rich diagnostics
+* `dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py` — speed & precision benchmark, scaling plots
+* `dfextensions/quantile_fit_nd/quantile_fit_nd.md` — full spec (math, API, guarantees)
+
+## Core Assumptions & Policies
+
+* **Δq-centered OLS** per window $|Q-q_0|\le \Delta q$, default $\Delta q=0.05$.
+* **Monotonicity**: enforce $b \ge b_\text{min}$ (configurable; "auto" heuristic or fixed).
+* **Nuisance interpolation**: separable (linear now; PCHIP later); only q must be monotone.
+* **Discrete inputs**:
+
+  * Prefer **randomized PIT**: $U = F(k\!-\!1) + V\,[F(k)-F(k\!-\!1)]$, $V\sim\text{Unif}(0,1)$.
+  * Or **mid-ranks**: $U = \tfrac{F(k\!-\!1)+F(k)}{2}$ (deterministic).
+  * Helpers: `discrete_to_uniform_rank_poisson`, `discrete_to_uniform_rank_empirical`.
+* **Uncertainty**: $\sigma_Q \approx \sigma_{X|Q}/|b|$. Irreducible vs reducible split available downstream.
+
+## Public API (stable)
+
+```python
+from dfextensions.quantile_fit_nd.quantile_fit_nd import fit_quantile_linear_nd, QuantileEvaluator
+
+table = fit_quantile_linear_nd(
+    df,  # columns: channel_id, Q, X, nuisance cols (e.g. z_vtx), is_outlier (optional)
+    channel_key="channel_id",
+    q_centers=np.arange(0, 1.0001, 0.025),
+    dq=0.05,
+    nuisance_axes={"z": "z_vtx"},  # later: {"z":"z_vtx","eta":"eta","time":"timestamp"}
+    n_bins_axes={"z": 20},
+    mask_col="is_outlier",
+    b_min_option="auto",  # or "fixed"
+)
+
+evalr = QuantileEvaluator(table)
+q_hat = evalr.invert_rank(X=123.0, channel_id="ch0", z=1.2)
+a, b, sigmaQ = evalr.params(channel_id="ch0", q=0.4, z=0.0)
+```
+
+### Output table (columns)
+
+`channel_id, q_center, <axis>_center..., a, b, sigma_Q, sigma_Q_irr (optional), dX_dN (optional), db_d<axis>..., fit_stats(json), timestamp(optional)`
+
+## Quickstart (clean run)
+
+```bash
+# 1) Unit tests with diagnostics
+pytest -q -s dfextensions/quantile_fit_nd/test_quantile_fit_nd.py
+
+# 2) Benchmark speed + precision + scaling (and plots)
+python dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py --plot \
+  --dists uniform,poisson,gaussian --Ns 2000,5000,10000,20000,50000 --lam 50
+```
+
+* **Interpretation**: `rms_b ~ N^{-1/2}` (α≈−0.5); `rms_rt ~ const` (α≈0) because round-trip error is per-event.
+
+## Reproducibility knobs
+
+* RNG seed fixed in tests/bench (`RNG = np.random.default_rng(123456)`).
+* Poisson rank mode: randomized PIT (default) vs mid-rank (deterministic) — switch in test/bench helpers.
+* Scaling tolerances (`--scaling_tol`, `--rt_tol`) in the benchmark.
+
+## Known Limitations
+
+* Very edge q windows (near 0 or 1) can be data-sparse; we store fit_stats and may skip non-informative windows.
+* With extremely discrete/uniform ranks (without PIT), OLS becomes degenerate; the fitter will flag `low_Q_spread`.
+* Current interpolation is linear; PCHIP (shape-preserving) can be enabled later.
+* Inversion uses a stable linear local model and bracketing; it works inside the grid and clips at the edges.
+
+## Next Steps (nice-to-have)
+
+* Optional robust fit (`fit_mode="huber"`), once outlier flags stabilize.
+* Add time as a nuisance axis or do time-sliced parallel fits + chain.
+* Export ROOT trees consistently (Parquet/Arrow already supported).
+* Add ML-friendly derivative grids (db/dz, db/dη) at higher resolution.
+
+## Troubleshooting
+
+* **ImportError in tests**: ensure `dfextensions/quantile_fit_nd/__init__.py` exists and you run from the repo root.
+* **.idea committed**: add `.idea/` to the repo-level `.gitignore` to avoid IDE noise.
+* **Poisson looks "nonsense"**: confirm PIT/mid-rank preprocessing of counts before calling `fit_*`.
+
+---
+
diff --git a/UTILS/dfextensions/quantile_fit_nd/diff.txt b/UTILS/dfextensions/quantile_fit_nd/diff.txt new file mode 100644 index 000000000..366052af4 --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/diff.txt @@ -0,0 +1,203 @@
+diff --git a/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.py b/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.py
+index c757cbc6..bcc0c8c4 100644
+--- a/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.py
++++ b/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.py
+@@ -107,7 +107,7 @@ def fit_quantile_linear_nd(
+     nuisance_axes: Dict[str, str] = None,  # e.g. {"z": "z_vtx", "eta": "eta"}
+     n_bins_axes: Dict[str, int] = None,    # e.g. {"z": 10}
+     mask_col: Optional[str] = "is_outlier",
+-    b_min_option: str = "auto",  # "auto" or "fixed"
++    b_min_option: str = "auto",            # "auto" or "fixed"
+     b_min_value: float = 1e-6,
+     fit_mode: str = "ols",
+     kappa_w: float = 1.3,
+@@ -117,32 +117,33 @@
+     Fit local linear inverse-CDF per channel, per (q_center, nuisance bins).
+ Returns a flat DataFrame (calibration table) with coefficients and diagnostics. + +- Columns expected: ++ Columns expected in df: + - channel_key, Q, X, and nuisance columns per nuisance_axes dict. + - mask_col (optional): True rows are excluded. + + Notes: +- - degree-1 only, Δq-centered model. ++ - Degree-1 only, Δq-centered model: X = a + b*(Q - q_center). + - b>0 enforced via floor (auto/fixed). + - sigma_Q = sigma_X|Q / |b| +- - sigma_Q_irr optional (needs dX/dN proxy; here left NaN by default). ++ - sigma_Q_irr left NaN unless a multiplicity model is provided downstream. + """ + if nuisance_axes is None: + nuisance_axes = {} + if n_bins_axes is None: + n_bins_axes = {ax: 10 for ax in nuisance_axes} ++ + df = df.copy() + ++ # Ensure a boolean keep-mask exists + if mask_col is None or mask_col not in df.columns: + df["_mask_keep"] = True + mask_col_use = "_mask_keep" + else: + mask_col_use = mask_col + +- # Prepare nuisance bin centers per axis ++ # ------------------------ build nuisance binning ------------------------ + axis_to_centers: Dict[str, np.ndarray] = {} + axis_to_idxcol: Dict[str, str] = {} +- + for ax, col in nuisance_axes.items(): + centers = _build_uniform_centers(df[col].to_numpy(np.float64), int(n_bins_axes.get(ax, 10))) + axis_to_centers[ax] = centers +@@ -150,21 +151,18 @@ def fit_quantile_linear_nd( + df[idxcol] = _assign_bin_indices(df[col].to_numpy(np.float64), centers) + axis_to_idxcol[ax] = idxcol + +- # Group by channel and nuisance bin tuple + bin_cols = [axis_to_idxcol[a] for a in nuisance_axes] +- out_rows = [] ++ out_rows: list[dict] = [] + +- # iterate per channel ++ # --------------------------- iterate channels -------------------------- + for ch_val, df_ch in df.groupby(channel_key, sort=False, dropna=False): + # iterate bins of nuisance axes + if bin_cols: + if len(bin_cols) == 1: +- # avoid FutureWarning: use scalar grouper when only one column +- gb = df_ch.groupby(bin_cols[0], sort=False, dropna=False) ++ gb = df_ch.groupby(bin_cols[0], sort=False, dropna=False) # avoid FutureWarning + else: + gb = df_ch.groupby(bin_cols, sort=False, dropna=False) + else: +- # single group with empty tuple key + df_ch = df_ch.copy() + df_ch["__bin_dummy__"] = 0 + gb = df_ch.groupby(["__bin_dummy__"], sort=False, dropna=False) +@@ -174,18 +172,23 @@ def fit_quantile_linear_nd( + bin_key = (bin_key,) + + # select non-outliers +- gmask = (df_cell[mask_col_use] == False) if mask_col_use in df_cell.columns else np.ones(len(df_cell), dtype=bool) +- if gmask.sum() < 6: +- # record empty cells as NaN rows for all q_centers (optional) ++ keep = (df_cell[mask_col_use] == False) if mask_col_use in df_cell.columns else np.ones(len(df_cell), dtype=bool) ++ n_keep = int(keep.sum()) ++ masked_frac = 1.0 - float(keep.mean()) ++ ++ X_all = df_cell.loc[keep, "X"].to_numpy(np.float64) ++ Q_all = df_cell.loc[keep, "Q"].to_numpy(np.float64) ++ ++ # If too few points overall, emit NaNs for all q-centers in this cell ++ if n_keep < 6: + for q0 in q_centers: + row = { + "channel_id": ch_val, + "q_center": float(q0), + "a": np.nan, "b": np.nan, "sigma_Q": np.nan, + "sigma_Q_irr": np.nan, "dX_dN": np.nan, +- "fit_stats": json.dumps({"n_used": int(gmask.sum()), "ok": False, "masked_frac": float(1.0 - gmask.mean())}) ++ "fit_stats": json.dumps({"n_used": n_keep, "ok": False, "masked_frac": float(masked_frac)}) + } +- # write nuisance centers + for ax_i, ax in enumerate(nuisance_axes): + row[f"{ax}_center"] = float(axis_to_centers[ax][bin_key[ax_i]]) + if timestamp is not None: +@@ -193,33 
+196,44 @@ def fit_quantile_linear_nd( + out_rows.append(row) + continue + +- X_all = df_cell.loc[gmask, "X"].to_numpy(np.float64) +- Q_all = df_cell.loc[gmask, "Q"].to_numpy(np.float64) +- +- # stats for auto floor +- sigmaX_cell = float(np.std(X_all)) if X_all.size > 1 else 0.0 +- bmin = _auto_b_min(sigmaX_cell, dq) if b_min_option == "auto" else float(b_min_value) +- +- masked_frac = 1.0 - float(gmask.mean()) +- ++ # -------------------- per-q_center sliding window -------------------- + for q0 in q_centers: + in_win = (Q_all >= q0 - dq) & (Q_all <= q0 + dq) +- if in_win.sum() < 6: ++ n_win = int(in_win.sum()) ++ ++ # window-local auto b_min (compute BEFORE branching to avoid NameError) ++ if b_min_option == "auto": ++ if n_win > 1: ++ sigmaX_win = float(np.std(X_all[in_win])) ++ else: ++ # fallback to overall scatter in this cell ++ sigmaX_win = float(np.std(X_all)) if X_all.size > 1 else 0.0 ++ bmin = _auto_b_min(sigmaX_win, dq) ++ else: ++ bmin = float(b_min_value) ++ ++ if n_win < 6: + row = { + "channel_id": ch_val, + "q_center": float(q0), + "a": np.nan, "b": np.nan, "sigma_Q": np.nan, + "sigma_Q_irr": np.nan, "dX_dN": np.nan, +- "fit_stats": json.dumps({"n_used": int(in_win.sum()), "ok": False, "masked_frac": masked_frac}) ++ "fit_stats": json.dumps({ ++ "n_used": n_win, "ok": False, ++ "masked_frac": float(masked_frac), ++ "b_min": float(bmin) ++ }) + } + else: + a, b, sigX, n_used, stats = _local_fit_delta_q(Q_all[in_win], X_all[in_win], q0) ++ + # monotonicity floor + if not np.isfinite(b) or b <= 0.0: + b = bmin + clipped = True + else: + clipped = False ++ + sigma_Q = _sigma_Q_from_sigmaX(b, sigX) + fit_stats = { + "n_used": int(n_used), +@@ -237,7 +251,7 @@ def fit_quantile_linear_nd( + "fit_stats": json.dumps(fit_stats) + } + +- # write nuisance centers ++ # write nuisance centers and optional timestamp + for ax_i, ax in enumerate(nuisance_axes): + row[f"{ax}_center"] = float(axis_to_centers[ax][bin_key[ax_i]]) + if timestamp is not None: +@@ -246,7 +260,7 @@ def fit_quantile_linear_nd( + + table = pd.DataFrame(out_rows) + +- # Attach metadata ++ # ------------------------------ metadata ------------------------------ + table.attrs.update({ + "model": "X = a + b*(Q - q_center)", + "dq": float(dq), +@@ -258,21 +272,17 @@ def fit_quantile_linear_nd( + "channel_key": channel_key, + }) + +- # Finite-diff derivatives along nuisance axes (db_d) ++ # --------- finite-difference derivatives along nuisance axes ---------- + for ax in nuisance_axes: +- # compute per-channel, per-q_center derivative across axis centers + der_col = f"db_d{ax}" + table[der_col] = np.nan +- # group by channel & q_center + for (ch, q0), g in table.groupby(["channel_id", "q_center"], sort=False): + centers = np.unique(g[f"{ax}_center"].to_numpy(np.float64)) + if centers.size < 2: + continue +- # sort by center + gg = g.sort_values(f"{ax}_center") + bvals = gg["b"].to_numpy(np.float64) + xc = gg[f"{ax}_center"].to_numpy(np.float64) +- # central differences + d = np.full_like(bvals, np.nan) + if bvals.size >= 2: + d[0] = (bvals[1] - bvals[0]) / max(xc[1] - xc[0], 1e-12) diff --git a/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.md b/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.md new file mode 100644 index 000000000..ce44ee0f0 --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.md @@ -0,0 +1,270 @@ +# quantile_fit_nd — Generic ND Quantile Linear Fitting Framework +**Version:** v3.1 +**Status:** Implementation Ready + +--- + +## 1. 
Overview
+
+This module provides a detector-agnostic framework for **quantile-based linear fitting** used in calibration, combined multiplicity estimation, and flow monitoring.
+
+We approximate the local inverse quantile function around each quantile grid point $q_0$ as:
+
+$$
+X(Q, \mathbf{n}) \;=\; a(q_0,\mathbf{n}) \;+\; b(q_0,\mathbf{n}) \cdot (Q - q_0)
+$$
+
+where:
+- $Q$ is the quantile rank of the amplitude,
+- $\mathbf{n}$ are nuisance coordinates (e.g., $z_{\mathrm{vtx}}, \eta, t$),
+- $a$ is the OLS intercept at $q_0$,
+- $b>0$ is the local slope (monotonicity in $Q$).
+
+The framework outputs **tabulated coefficients and diagnostics** in a flat DataFrame for time-series monitoring, ML downstream use, and export to Parquet/Arrow/ROOT.
+
+---
+
+## 2. Directory contents
+
+| File | Role |
+|---|---|
+| `quantile_fit_nd.py` | Implementation (fit, interpolation, evaluator, I/O) |
+| `test_quantile_fit_nd.py` | Unit & synthetic tests |
+| `quantile_fit_nd.md` | This design & usage document |
+
+---
+
+## 3. Goals
+
+1. Fit local linear inverse-CDF per **channel** with monotonicity in $Q$.
+2. Smooth over nuisance axes with separable interpolation (linear/PCHIP).
+3. Provide **physics-driven** slope floors to avoid rank blow-ups.
+4. Store results as **DataFrames** with rich diagnostics and metadata.
+5. Keep the API **detector-independent** (no detector ID in core interface).
+
+---
+
+## 4. Required input columns
+
+| Column | Description |
+|---|---|
+| `channel_id` | Unique local channel key |
+| `Q` | Quantile rank (normalized by detector reference) |
+| `X` | Measured amplitude (or normalized signal) |
+| `z_vtx`, `eta`, `time` | Nuisance coordinates (configurable subset) |
+| `is_outlier` | Optional boolean mask; `True` rows are excluded from fits |
+
+> Preprocessing (e.g., timing outliers) is expected to fill `is_outlier`.
+
+---
+
+## 5. Output table schema
+
+The fit returns a flat, appendable table with explicit grid points.
+
+| Column | Description |
+|---|---|
+| `channel_id` | Channel identifier |
+| `q_center` | Quantile center of the local fit |
+| `<axis>_center` | Centers of nuisance bins (e.g., `z_center`) |
+| `a` | Intercept (from OLS at $q_0$) |
+| `b` | Slope (clipped to $b_{\min}>0$ if needed) |
+| `sigma_Q` | Total quantile uncertainty $\sigma_{X|Q} / |b|$ |
+| `sigma_Q_irr` | Irreducible error (from multiplicity fluctuation) |
+| `dX_dN` | Sensitivity to multiplicity proxy (optional) |
+| `db_d<axis>` | Finite-difference derivative along each nuisance axis |
+| `fit_stats` | JSON with `Npoints`, `RMS`, `chi2_ndf`, `masked_frac`, `clipped_frac` |
+| `timestamp` | Calibration/run time (optional) |
+
+**Example metadata stored in `DataFrame.attrs`:**
+```json
+{
+  "model": "X = a + b*(Q - q_center)",
+  "dq": 0.05,
+  "b_min_option": "auto",
+  "b_min_formula": "b_min = 0.25 * sigma_X / (2*dq)",
+  "axes": ["q", "z"],
+  "fit_mode": "ols",
+  "kappa_w": 1.3
+}
+```
+
+---
+
+## 6. Fit procedure (per channel, per grid cell)
+
+1. **Window selection**: select rows with $|Q - q_0| \le \Delta q$ (default $\Delta q=0.05$).
+2. **Masking**: use rows where `is_outlier == False`. Record masked fraction.
+3. **Local regression**: OLS fit of $X$ vs $(Q-q_0)$ → coefficients $a, b$.
+4. **Uncertainty**:
+   - Residual RMS → $\sigma_{X|Q}$
+   - Total quantile uncertainty: $\sigma_Q = \sigma_{X|Q} / |b|$
+   - Irreducible term: $\sigma_{Q,\mathrm{irr}} = |dX/dN| \cdot \sigma_N / |b|$ with $\sigma_N \approx \kappa_w \sqrt{N_{\text{proxy}}}$
+5. **Monotonicity**:
+   - Enforce $b > b_{\min}$.
+   - Floor policy:
+     - `"auto"`: $b_{\min} = 0.25 \cdot \sigma_X / (2\Delta q)$ (heuristic)
+     - `"fixed"`: constant floor (default $10^{-6}$)
+   - Record `clipped_frac` in `fit_stats`.
+6. **Tabulation**: write row with coefficients, diagnostics, and centers of nuisance bins.
+
+**Edge quantiles**: same $\Delta q$ policy near $q=0,1$ (no special gating by default).
+
+---
+
+## 7. Interpolation and monotonicity preservation
+
+* **Separable interpolation** along nuisance axes (e.g., `z`, `eta`, `time`) using linear or shape-preserving PCHIP.
+* **Monotone axis**: $Q$. At evaluation: nearest or linear between adjacent `q_center` points.
+* **Guarantee**: if all tabulated $b>0$ and nuisance interpolation does not cross zero, monotonicity in $Q$ is preserved. Any interpolated $b \le 0$ is clipped to $b_{\min}$.
+
+Correlations between nuisance axes are **diagnosed** (scores stored) but **not** modeled by tensor interpolation in v3.1.
+
+---
+
+## 8. Public API (summary)
+
+### Fitting
+
+```python
+fit_quantile_linear_nd(
+    df,
+    channel_key="channel_id",
+    q_centers=np.linspace(0, 1, 11),
+    dq=0.05,
+    nuisance_axes={"z": "z_vtx"},  # add {"eta": "eta"}, {"time": "timestamp"} later
+    mask_col="is_outlier",
+    b_min_option="auto",           # or "fixed"
+    fit_mode="ols"                 # "huber" optional in later versions
+) -> pandas.DataFrame
+```
+
+### Evaluation
+
+```python
+evalr = QuantileEvaluator(result_table)
+
+# Interpolated parameters at coordinates:
+a, b, sigma_Q = evalr.params(channel_id=42, q=0.40, z=2.1)
+
+# Invert amplitude to rank (clip to [0,1]):
+Q = evalr.invert_rank(X=123.0, channel_id=42, z=2.1)
+```
+
+### Persistence
+
+```python
+save_table(df, "calibration.parquet")
+save_table(df, "calibration.arrow", fmt="arrow")
+save_table(df, "calibration.root", fmt="root")  # requires uproot/PyROOT
+df2 = load_table("calibration.parquet")
+```
+
+---
+
+## 9. Derivatives & irreducible error
+
+* **Finite differences** for `db_dz`, `db_deta` at grid centers (central where possible; forward/backward at edges).
+* **Irreducible error** (stored as `sigma_Q_irr`):
+  $\sigma_{Q,\mathrm{irr}} = |dX/dN| \cdot \sigma_N / |b|$, with $\sigma_N = \kappa_w \sqrt{N_{\text{proxy}}}$.
+  `kappa_w` (default 1.3) reflects weight fluctuations (documented constant; can be overridden).
+
+> For data without truth $N$, $dX/dN$ may be estimated against a stable multiplicity proxy from the combined estimator.
+
+---
+
+## 10. QA & summaries
+
+Optional **per-channel summary** rows per calibration period:
+
+* mean/median of `sigma_Q`,
+* `%` of cells clipped by `b_min`,
+* masked fraction,
+* residual RMS, `chi2_ndf`,
+* counts of fitted vs. skipped cells.
+
+Drift/stability analysis is expected in external tooling by **chaining** calibration tables over time.
+
+---
+
+## 11. Unit & synthetic tests (see `test_quantile_fit_nd.py`)
+
+| Test ID | Purpose                                       |
+| ------- | --------------------------------------------- |
+| T00     | Smoke test (single channel, $(q,z)$ grid)     |
+| T01     | Monotonicity enforcement (all $b > b_{\min}$) |
+| T02     | Edge behavior near $q\in\{0,1\}$ per policy   |
+| T03     | Outlier masking stability                     |
+| T04     | $\sigma_Q$ scaling vs injected noise          |
+| T05     | `db_dz` finite-diff accuracy on known slope   |
+| T06     | Round-trip $Q \to X \to Q$ small residual     |
+| T07     | Parquet/Arrow/ROOT save/load parity           |
+
+---
+
+## 12. Performance expectations
+
+| Aspect | Estimate |
+| --------------- | -------------------------------------------------------- |
+| Complexity | $O(N \cdot \Delta q)$ per channel |
+| CPU | $(q,z)$ fit: seconds; ND adds ~20–30% from interpolation |
+| Parallelization | Natural via Pandas/Dask groupby |
+| Table size | $O(\text{grid points} \times \text{channels})$, MB-scale |
+| Storage | Parquet typically < 10 MB per calibration slice |
+
+---
+
+## 13. Configurable parameters
+
+| Name | Default | Meaning |
+| --------------- | ---------------- | ---------------------------------------- |
+| `dq` | 0.05 | Quantile window half-width |
+| `b_min_option` | `auto` | Slope floor policy (`auto` or `fixed`) |
+| `fit_mode` | `ols` | Regression type |
+| `mask_col` | `is_outlier` | Outlier flag column |
+| `kappa_w` | 1.3 | Weight-fluctuation factor (doc/override) |
+| `nuisance_axes` | `{"z": "z_vtx"}` | Axes for smoothing |
+
+---
+## 14. Discrete Inputs (policy)
+
+**Assumption:** Within each sliding window $|Q - q_0| \le \Delta q$, the rank $Q$ has enough spread to estimate a slope.
+
+For **discrete sources** (e.g. integer tracks/clusters, Poisson-like):
+convert counts to a continuous rank **before** calling the fitter:
+
+- **Randomized PIT (preferred):**
+  $U = F(k-1) + V\,[F(k)-F(k-1)],\ V\sim \mathrm{Unif}(0,1)$
+  (exact Uniform(0,1), information-preserving).
+- **Mid-ranks (deterministic):**
+  $\tilde U = \tfrac{F(k-1)+F(k)}{2}$.
+
+Helpers provided:
+- `discrete_to_uniform_rank_poisson(k, lam, mode='randomized'|'midrank')`
+- `discrete_to_uniform_rank_empirical(x, mode='randomized'|'midrank')`
+
+The core fitter does **not** widen Δq or inject noise; it will label windows
+with insufficient spread as `low_Q_spread`. This separation keeps the
+calibration math transparent and reproducible.
+
+
+
+## 15. Future extensions
+
+* Optional **Huber** robust regression mode.
+* Degree-2 local fits with derivative-based monotonicity checks.
+* Covariance modeling across nuisance axes.
+* Adaptive time binning based on drift thresholds.
+* ML-ready derivatives and cost-function integration.
+
+---
+
+## 16. References
+
+* PWG-P context: combined multiplicity/flow estimator materials.
+* RootInteractive / AliasDataFrame pipelines for calibration QA.
+
+---
diff --git a/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.py b/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.py new file mode 100644 index 000000000..5abbb005d --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/quantile_fit_nd.py @@ -0,0 +1,562 @@
+# dfextensions/quantile_fit_nd/quantile_fit_nd.py
+# v3.1 — ND quantile linear fitting (Δq-centered), separable interpolation, evaluator, and I/O.
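+# Model: X = a + b*(Q - q_center), b > 0 enforced via a floor; see quantile_fit_nd.md for the full spec.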
+# Dependencies: numpy, pandas (optional: pyarrow, fastparquet, scipy for PCHIP) +from __future__ import annotations +from dataclasses import dataclass +from typing import Dict, Tuple, Optional, Sequence, Any +import json +import warnings + +import numpy as np +import pandas as pd + + +# ----------------------------- Utilities --------------------------------- + +def _ensure_array(x) -> np.ndarray: + return np.asarray(x, dtype=np.float64) + + +def _bin_edges_from_centers(centers: np.ndarray) -> np.ndarray: + """Create edges from sorted centers (extrapolate half-steps at ends).""" + c = _ensure_array(centers) + mid = 0.5 * (c[1:] + c[:-1]) + first = c[0] - (mid[0] - c[0]) + last = c[-1] + (c[-1] - mid[-1]) + return np.concatenate([[first], mid, [last]]) + + +def _build_uniform_centers(values: np.ndarray, n_bins: int) -> np.ndarray: + vmin, vmax = np.nanmin(values), np.nanmax(values) + if vmin == vmax: + # degenerate: single bin at that value + return np.array([vmin], dtype=np.float64) + return np.linspace(vmin, vmax, n_bins, dtype=np.float64) + + +def _assign_bin_indices(values: np.ndarray, centers: np.ndarray) -> np.ndarray: + """Return integer indices mapping each value to nearest center (safe, inclusive edges).""" + edges = _bin_edges_from_centers(centers) + idx = np.searchsorted(edges, values, side="right") - 1 + idx = np.clip(idx, 0, len(centers) - 1) + return idx.astype(np.int32) + + +def _linear_interp_1d(xc: np.ndarray, yc: np.ndarray, x: float) -> float: + """Piecewise-linear interpolation clamped to endpoints. yc may contain NaNs -> nearest good.""" + xc = _ensure_array(xc) + yc = _ensure_array(yc) + good = np.isfinite(yc) + if good.sum() == 0: + return np.nan + xcg, ycg = xc[good], yc[good] + if x <= xcg[0]: + return float(ycg[0]) + if x >= xcg[-1]: + return float(ycg[-1]) + j = np.searchsorted(xcg, x) + x0, x1 = xcg[j-1], xcg[j] + y0, y1 = ycg[j-1], ycg[j] + t = (x - x0) / max(x1 - x0, 1e-12) + return float((1 - t) * y0 + t * y1) + + +def _local_fit_delta_q(Qw: np.ndarray, Xw: np.ndarray, q0: float) -> Tuple[float, float, float, int, Dict[str, Any]]: + """ + Stable 2-parameter OLS in the Δq-centered model: + X = a + b * (Q - q0) + Returns: + a, b, sigma_X|Q (RMS of residuals), n_used, stats(dict) + Rejects windows with insufficient Q spread to estimate slope reliably. + """ + Qw = np.asarray(Qw, dtype=np.float64) + Xw = np.asarray(Xw, dtype=np.float64) + m = np.isfinite(Qw) & np.isfinite(Xw) + Qw, Xw = Qw[m], Xw[m] + n = Qw.size + if n < 3: + return np.nan, np.nan, np.nan, int(n), {"ok": False, "reason": "n<3"} + + dq = Qw - q0 + # Degeneracy checks for discrete/plateau windows (typical in Poisson-CDF ranks) + # Require at least 3 unique Q values and a minimal span in Q. 
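+    # (A tiny span_q makes the OLS slope ill-conditioned: Var(Q) inside the window is
+    # then too small to separate the slope from the intercept, so such windows are rejected.)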
+ uq = np.unique(np.round(Qw, 6)) # rounding collapses near-duplicates + span_q = float(np.max(Qw) - np.min(Qw)) if n else 0.0 + if uq.size < 3 or span_q < 1e-3: + return np.nan, np.nan, np.nan, int(n), { + "ok": False, "reason": "low_Q_spread", "n_unique_q": int(uq.size), "span_q": span_q + } + + # Design matrix for OLS: [1, (Q - q0)] + A = np.column_stack([np.ones(n, dtype=np.float64), dq]) + # Least squares solution (stable even when dq mean ≠ 0) + sol, resid, rank, svals = np.linalg.lstsq(A, Xw, rcond=None) + a, b = float(sol[0]), float(sol[1]) + + # Residual RMS as sigma_X|Q + if n > 2: + if resid.size > 0: + rss = float(resid[0]) + else: + # fallback if lstsq doesn't return resid (e.g., rank-deficient weird cases) + rss = float(np.sum((Xw - (a + b * dq)) ** 2)) + sigmaX = float(np.sqrt(max(rss, 0.0) / (n - 2))) + else: + sigmaX = np.nan + + stats = { + "ok": True, + "rms": sigmaX, + "n_used": int(n), + "n_unique_q": int(uq.size), + "span_q": span_q, + } + return a, b, sigmaX, int(n), stats + + +def _sigma_Q_from_sigmaX(b: float, sigma_X_given_Q: float) -> float: + if not np.isfinite(b) or b == 0: + return np.nan + return float(abs(sigma_X_given_Q) / abs(b)) + + +def _auto_b_min(sigma_X: float, dq: float, c: float = 0.25) -> float: + # heuristic to avoid explosive Q when amplitude scatter is large vs window + return float(max(1e-12, c * sigma_X / max(2.0 * dq, 1e-12))) + + +# ------------------------------ Fit API ---------------------------------- + +def fit_quantile_linear_nd( + df: pd.DataFrame, + *, + channel_key: str = "channel_id", + q_centers: np.ndarray = np.linspace(0.0, 1.0, 11), + dq: float = 0.05, + nuisance_axes: Dict[str, str] = None, # e.g. {"z": "z_vtx", "eta": "eta"} + n_bins_axes: Dict[str, int] = None, # e.g. {"z": 10} + mask_col: Optional[str] = "is_outlier", + b_min_option: str = "auto", # "auto" or "fixed" + b_min_value: float = 1e-6, + fit_mode: str = "ols", + kappa_w: float = 1.3, + timestamp: Optional[Any] = None, +) -> pd.DataFrame: + """ + Fit local linear inverse-CDF per channel, per (q_center, nuisance bins). + Degree-1, Δq-centered model: X = a + b*(Q - q_center). + + Monotonicity: + - Enforce floor b>=b_min ONLY for valid fits with non-positive b. + - Degenerate windows (low Q spread / too few unique Q) remain NaN (no flooring). + + sigma_Q = sigma_X|Q / |b| + + Returns a flat DataFrame with coefficients and diagnostics. 
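+
+    Usage sketch (parameter names as documented above; synthetic df as built in the unit tests):
+
+        table = fit_quantile_linear_nd(
+            df, channel_key="channel_id",
+            q_centers=np.arange(0.0, 1.0 + 1e-12, 0.025), dq=0.05,
+            nuisance_axes={"z": "z_vtx"}, n_bins_axes={"z": 20},
+        )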
+ """ + if nuisance_axes is None: + nuisance_axes = {} + if n_bins_axes is None: + n_bins_axes = {ax: 10 for ax in nuisance_axes} + + df = df.copy() + + # Ensure a boolean keep-mask exists + if mask_col is None or mask_col not in df.columns: + df["_mask_keep"] = True + mask_col_use = "_mask_keep" + else: + mask_col_use = mask_col + + # ------------------------ build nuisance binning ------------------------ + axis_to_centers: Dict[str, np.ndarray] = {} + axis_to_idxcol: Dict[str, str] = {} + for ax, col in nuisance_axes.items(): + centers = _build_uniform_centers(df[col].to_numpy(np.float64), int(n_bins_axes.get(ax, 10))) + axis_to_centers[ax] = centers + idxcol = f"__bin_{ax}" + df[idxcol] = _assign_bin_indices(df[col].to_numpy(np.float64), centers) + axis_to_idxcol[ax] = idxcol + + bin_cols = [axis_to_idxcol[a] for a in nuisance_axes] + out_rows: list[dict] = [] + + # --------------------------- iterate channels -------------------------- + for ch_val, df_ch in df.groupby(channel_key, sort=False, dropna=False): + # iterate bins of nuisance axes + if bin_cols: + if len(bin_cols) == 1: + gb = df_ch.groupby(bin_cols[0], sort=False, dropna=False) # avoid FutureWarning + else: + gb = df_ch.groupby(bin_cols, sort=False, dropna=False) + else: + df_ch = df_ch.copy() + df_ch["__bin_dummy__"] = 0 + gb = df_ch.groupby(["__bin_dummy__"], sort=False, dropna=False) + + for bin_key, df_cell in gb: + if not isinstance(bin_key, tuple): + bin_key = (bin_key,) + + # select non-outliers + keep = (df_cell[mask_col_use] == False) if mask_col_use in df_cell.columns else np.ones(len(df_cell), dtype=bool) + n_keep = int(keep.sum()) + masked_frac = 1.0 - float(keep.mean()) + + X_all = df_cell.loc[keep, "X"].to_numpy(np.float64) + Q_all = df_cell.loc[keep, "Q"].to_numpy(np.float64) + + # If too few points overall, emit NaNs for all q-centers in this cell + if n_keep < 6: + for q0 in q_centers: + row = { + "channel_id": ch_val, + "q_center": float(q0), + "a": np.nan, "b": np.nan, "sigma_Q": np.nan, + "sigma_Q_irr": np.nan, "dX_dN": np.nan, + "fit_stats": json.dumps({"n_used": n_keep, "ok": False, "reason": "cell_n<6", "masked_frac": float(masked_frac)}) + } + for ax_i, ax in enumerate(nuisance_axes): + row[f"{ax}_center"] = float(axis_to_centers[ax][bin_key[ax_i]]) + if timestamp is not None: + row["timestamp"] = timestamp + out_rows.append(row) + continue + + # -------------------- per-q_center sliding window -------------------- + for q0 in q_centers: + in_win = (Q_all >= q0 - dq) & (Q_all <= q0 + dq) + n_win = int(in_win.sum()) + + # window-local b_min (compute BEFORE branching) + if b_min_option == "auto": + if n_win > 1: + sigmaX_win = float(np.std(X_all[in_win])) + else: + sigmaX_win = float(np.std(X_all)) if X_all.size > 1 else 0.0 + bmin = _auto_b_min(sigmaX_win, dq) + else: + bmin = float(b_min_value) + + if n_win < 6: + row = { + "channel_id": ch_val, + "q_center": float(q0), + "a": np.nan, "b": np.nan, "sigma_Q": np.nan, + "sigma_Q_irr": np.nan, "dX_dN": np.nan, + "fit_stats": json.dumps({ + "n_used": n_win, "ok": False, "reason": "win_n<6", + "masked_frac": float(masked_frac), "b_min": float(bmin) + }) + } + else: + a, b, sigX, n_used, stats = _local_fit_delta_q(Q_all[in_win], X_all[in_win], q0) + + # If fit is NOT ok (e.g. 
low_Q_spread), keep NaNs (do NOT floor here) + if not bool(stats.get("ok", True)): + row = { + "channel_id": ch_val, + "q_center": float(q0), + "a": np.nan, "b": np.nan, "sigma_Q": np.nan, + "sigma_Q_irr": np.nan, "dX_dN": np.nan, + "fit_stats": json.dumps({ + **stats, "ok": False, "n_used": int(n_used), + "masked_frac": float(masked_frac), "b_min": float(bmin) + }) + } + else: + # Valid fit: enforce b floor only if b<=0 (monotonicity) + clipped = False + if not np.isfinite(b) or b <= 0.0: + b = max(bmin, 1e-9) + clipped = True + + sigma_Q = _sigma_Q_from_sigmaX(b, sigX) + fit_stats = { + **stats, + "n_used": int(n_used), + "ok": True, + "masked_frac": float(masked_frac), + "clipped": bool(clipped), + "b_min": float(bmin), + } + row = { + "channel_id": ch_val, + "q_center": float(q0), + "a": float(a), "b": float(b), "sigma_Q": float(sigma_Q), + "sigma_Q_irr": np.nan, "dX_dN": np.nan, + "fit_stats": json.dumps(fit_stats) + } + + # write nuisance centers and optional timestamp + for ax_i, ax in enumerate(nuisance_axes): + row[f"{ax}_center"] = float(axis_to_centers[ax][bin_key[ax_i]]) + if timestamp is not None: + row["timestamp"] = timestamp + out_rows.append(row) + + table = pd.DataFrame(out_rows) + + # ------------------------------ metadata ------------------------------ + table.attrs.update({ + "model": "X = a + b*(Q - q_center)", + "dq": float(dq), + "b_min_option": b_min_option, + "b_min_value": float(b_min_value), + "fit_mode": fit_mode, + "kappa_w": float(kappa_w), + "axes": ["q"] + list(nuisance_axes.keys()), + "channel_key": channel_key, + }) + + # --------- finite-difference derivatives along nuisance axes ---------- + for ax in nuisance_axes: + der_col = f"db_d{ax}" + table[der_col] = np.nan + for (ch, q0), g in table.groupby(["channel_id", "q_center"], sort=False): + centers = np.unique(g[f"{ax}_center"].to_numpy(np.float64)) + if centers.size < 2: + continue + gg = g.sort_values(f"{ax}_center") + bvals = gg["b"].to_numpy(np.float64) + xc = gg[f"{ax}_center"].to_numpy(np.float64) + d = np.full_like(bvals, np.nan) + if bvals.size >= 2: + d[0] = (bvals[1] - bvals[0]) / max(xc[1] - xc[0], 1e-12) + d[-1] = (bvals[-1] - bvals[-2]) / max(xc[-1] - xc[-2], 1e-12) + if bvals.size >= 3: + for i in range(1, bvals.size - 1): + d[i] = (bvals[i+1] - bvals[i-1]) / max(xc[i+1] - xc[i-1], 1e-12) + table.loc[gg.index, der_col] = d + + return table + + + +# --------------------------- Evaluator API ------------------------------- + +@dataclass +class QuantileEvaluator: + table: pd.DataFrame + + def __post_init__(self): + self._build_index() + + def _build_index(self): + t = self.table + if "channel_id" not in t.columns or "q_center" not in t.columns: + raise ValueError("Calibration table missing 'channel_id' or 'q_center'.") + # detect nuisance axes from columns ending with _center, but EXCLUDE q_center + self.axes = [] + for c in t.columns: + if c.endswith("_center") and c != "q_center": + self.axes.append(c[:-7]) # strip '_center' + self.q_centers = np.sort(t["q_center"].unique()) + # map channel -> nested dicts of arrays over (q, axis1, axis2, ...) 
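+        # Layout: store[ch] = {"A"/"B"/"SQ": dense NaN-filled grids of shape
+        # (n_q_centers, n_ax1, ...), "axes": {ax: sorted centers}}; dense grids keep
+        # the chained 1D interpolation in _interp_nuisance_vector simple.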
+        self.store: Dict[Any, Dict[str, Any]] = {}
+        for ch, gch in t.groupby("channel_id", sort=False):
+            # build sorted grids per axis
+            axis_centers = {ax: np.sort(gch[f"{ax}_center"].unique()) for ax in self.axes}
+            # allocate arrays
+            shape = (len(self.q_centers),) + tuple(len(axis_centers[ax]) for ax in self.axes)
+            A = np.full(shape, np.nan, dtype=np.float64)
+            B = np.full(shape, np.nan, dtype=np.float64)
+            SQ = np.full(shape, np.nan, dtype=np.float64)
+            # fill
+            for _, row in gch.iterrows():
+                qi = int(np.where(self.q_centers == row["q_center"])[0][0])
+                idx = [qi]
+                for ax in self.axes:
+                    ci = int(np.where(axis_centers[ax] == row[f"{ax}_center"])[0][0])
+                    idx.append(ci)
+                idx = tuple(idx)
+                A[idx] = row["a"]
+                B[idx] = row["b"]
+                SQ[idx] = row["sigma_Q"]
+            self.store[ch] = {"A": A, "B": B, "SQ": SQ, "axes": axis_centers}
+
+    def _interp_nuisance_vector(self, arr: np.ndarray, coords: Dict[str, float]) -> np.ndarray:
+        """Reduce arr over nuisance axes via chained 1D linear interpolation; returns vector over q."""
+        out = arr
+        for ax in self.axes:
+            centers = self.store_axis_centers(ax)
+            # After each reduction the next nuisance axis sits at index 1 (q stays at index 0),
+            # so always move axis 1 to the end. (Using the enumerated axis index here would
+            # mis-address or overrun the shrinking array once more than one axis is present.)
+            out = np.moveaxis(out, 1, -1)
+            shp = out.shape[:-1]
+            reduced = np.empty(shp, dtype=np.float64)
+            for idx in np.ndindex(shp):
+                yc = out[idx]
+                reduced[idx] = _linear_interp_1d(centers, yc, coords.get(ax, float(centers[len(centers)//2])))
+            out = reduced
+        # out shape -> (len(q_centers),)
+        return out
+
+    def store_axis_centers(self, ax: str) -> np.ndarray:
+        # assumes all channels share same set; take from first channel
+        for ch in self.store:
+            return self.store[ch]["axes"][ax]
+        return np.array([], dtype=np.float64)
+
+    def params(self, *, channel_id: Any, q: float, **coords) -> Tuple[float, float, float]:
+        item = self.store.get(channel_id)
+        if item is None:
+            return np.nan, np.nan, np.nan
+        a_vec = self._interp_nuisance_vector(item["A"], coords)  # vector over q-centers
+        b_vec = self._interp_nuisance_vector(item["B"], coords)
+        s_vec = self._interp_nuisance_vector(item["SQ"], coords)
+        # interpolate across q-centers
+        a = _linear_interp_1d(self.q_centers, a_vec, q)
+        b = _linear_interp_1d(self.q_centers, b_vec, q)
+        s = _linear_interp_1d(self.q_centers, s_vec, q)
+        # monotonicity safeguard (clip b)
+        if not np.isfinite(b) or b <= 0.0:
+            # try minimal positive value to avoid NaN
+            b = 1e-9
+        return float(a), float(b), float(s)
+
+    def invert_rank(self, X: float, *, channel_id: Any, **coords) -> float:
+        """
+        Invert amplitude -> rank using a monotone, piecewise-blended segment model:
+        For q in [q_k, q_{k+1}], define
+            X_blend(q) = (1-t)*(a_k + b_k*(q - q_k)) + t*(a_{k+1} + b_{k+1}*(q - q_{k+1})),
+            t = (q - q_k) / (q_{k+1} - q_k).
+        With b_k>0, X_blend is monotone increasing => solve X_blend(q)=X via bisection.
+        Returns q in [0,1] or NaN if no information is available.
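+
+        Example (channel/axis names as used in the tests):
+            q_hat = evalr.invert_rank(123.0, channel_id="ch0", z=1.2)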
+ """ + item = self.store.get(channel_id) + if item is None: + return np.nan + + qc = self.q_centers + if qc.size < 2: + return np.nan + + # Interpolate nuisance -> vectors over q-centers + a_vec = self._interp_nuisance_vector(item["A"], coords) + b_vec = self._interp_nuisance_vector(item["B"], coords) + + # Fill NaNs across q using linear interpolation on valid centers + valid = np.isfinite(a_vec) & np.isfinite(b_vec) & (b_vec > 0.0) + if valid.sum() < 2: + return np.nan + + def _fill1d(xc, y): + v = np.isfinite(y) + if v.sum() == 0: + return y + if v.sum() == 1: + # only one point: flat fill + y2 = np.full_like(y, y[v][0]) + return y2 + y2 = np.array(y, dtype=np.float64, copy=True) + y2[~v] = np.interp(xc[~v], xc[v], y[v]) + return y2 + + a_f = _fill1d(qc, a_vec) + b_f = _fill1d(qc, b_vec) + # enforce positive floor to keep monotonicity + b_f = np.where(np.isfinite(b_f) & (b_f > 0.0), b_f, 1e-9) + + # Fast helpers for segment evaluation + def X_blend(q: float) -> float: + # find segment + if q <= qc[0]: + k = 0 + elif q >= qc[-1]: + k = qc.size - 2 + else: + k = int(np.clip(np.searchsorted(qc, q) - 1, 0, qc.size - 2)) + qk, qk1 = qc[k], qc[k + 1] + t = (q - qk) / (qk1 - qk) if qk1 > qk else 0.0 + ak, bk = a_f[k], b_f[k] + ak1, bk1 = a_f[k + 1], b_f[k + 1] + xk = ak + bk * (q - qk) + xk1 = ak1 + bk1 * (q - qk1) + return float((1.0 - t) * xk + t * xk1) + + # Bracket on [0,1] + f0 = X_blend(0.0) - X + f1 = X_blend(1.0) - X + if not np.isfinite(f0) or not np.isfinite(f1): + return np.nan + + # If not bracketed, clamp to nearest end (rare with our synthetic noise) + if f0 == 0.0: + return 0.0 + if f1 == 0.0: + return 1.0 + if f0 > 0.0 and f1 > 0.0: + return 0.0 + if f0 < 0.0 and f1 < 0.0: + return 1.0 + + # Bisection + lo, hi = 0.0, 1.0 + flo, fhi = f0, f1 + for _ in range(40): + mid = 0.5 * (lo + hi) + fm = X_blend(mid) - X + if not np.isfinite(fm): + break + # root in [lo, mid] ? 
+ if (flo <= 0.0 and fm >= 0.0) or (flo >= 0.0 and fm <= 0.0): + hi, fhi = mid, fm + else: + lo, flo = mid, fm + if abs(hi - lo) < 1e-6: + break + return float(0.5 * (lo + hi)) + + + +# ------------------------------ I/O helpers ------------------------------ + +def save_table(df: pd.DataFrame, path: str, fmt: str = "parquet") -> None: + fmt = fmt.lower() + if fmt == "parquet": + df.to_parquet(path, index=False) + elif fmt == "arrow": + import pyarrow as pa, pyarrow.ipc as ipc # noqa + table = pa.Table.from_pandas(df, preserve_index=False) + with ipc.new_file(path, table.schema) as writer: + writer.write(table) + elif fmt == "root": + try: + import uproot # noqa + except Exception as e: + raise RuntimeError("ROOT export requires 'uproot' or PyROOT.") from e + # minimal ROOT writer via uproot (one-shot) + with uproot.recreate(path) as f: + f["quantile_fit_nd"] = df + else: + raise ValueError(f"Unsupported fmt='{fmt}'") + + +def load_table(path: str, fmt: Optional[str] = None) -> pd.DataFrame: + if fmt is None: + if path.endswith(".parquet"): + fmt = "parquet" + elif path.endswith(".arrow") or path.endswith(".feather"): + fmt = "arrow" + elif path.endswith(".root"): + fmt = "root" + else: + fmt = "parquet" + fmt = fmt.lower() + if fmt == "parquet": + return pd.read_parquet(path) + elif fmt == "arrow": + import pyarrow as pa, pyarrow.ipc as ipc # noqa + with ipc.open_file(path) as reader: + t = reader.read_all() + return t.to_pandas() + elif fmt == "root": + import uproot # noqa + with uproot.open(path) as f: + # first TTree + keys = [k for k in f.keys() if k.endswith(";1")] + if not keys: + raise RuntimeError("No TTrees found in ROOT file") + return f[keys[0]].arrays(library="pd") + else: + raise ValueError(f"Unsupported fmt='{fmt}'") diff --git a/UTILS/dfextensions/quantile_fit_nd/test.log b/UTILS/dfextensions/quantile_fit_nd/test.log new file mode 100644 index 000000000..e1ad36457 --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/test.log @@ -0,0 +1,96 @@ + +=== Per–z-bin diagnostics (dist=uniform, N=5000) === +z_center | b_expected | b_meas_w | SE_pred(6σ) | |Δb|/SE6 | windows | clipped% +-10.000 | 48.056 | 49.236 | 38.761 | 0.030 | 11 | 0.00 + -7.778 | 48.480 | 50.779 | 29.501 | 0.078 | 11 | 0.00 + -5.556 | 48.898 | 48.954 | 26.448 | 0.002 | 11 | 0.00 + -3.333 | 49.354 | 48.876 | 18.367 | 0.026 | 11 | 0.00 + -1.111 | 49.786 | 51.573 | 15.788 | 0.113 | 11 | 0.00 + 1.111 | 50.214 | 50.275 | 19.083 | 0.003 | 11 | 0.00 + 3.333 | 50.649 | 51.742 | 19.846 | 0.055 | 11 | 0.00 + 5.556 | 51.096 | 52.493 | 22.533 | 0.062 | 11 | 0.00 + 7.778 | 51.539 | 51.703 | 28.169 | 0.006 | 11 | 0.00 + 10.000 | 51.957 | 49.482 | 46.666 | 0.053 | 11 | 0.00 +sigma_Q: mean relative error = 0.195 +Round-trip residuals: RMS=0.0107, MAD=0.0061, p10=-0.0138, p90=0.0134 +. 
+=== Per–z-bin diagnostics (dist=poisson, N=5000) === +z_center | b_expected | b_meas_w | SE_pred(6σ) | |Δb|/SE6 | windows | clipped% +-10.000 | 48.048 | 48.685 | 37.280 | 0.017 | 11 | 0.00 + -7.778 | 48.465 | 48.831 | 32.758 | 0.011 | 11 | 0.00 + -5.556 | 48.900 | 48.421 | 22.441 | 0.021 | 11 | 0.00 + -3.333 | 49.348 | 48.936 | 18.264 | 0.023 | 11 | 0.00 + -1.111 | 49.784 | 49.016 | 17.990 | 0.043 | 11 | 0.00 + 1.111 | 50.215 | 50.240 | 17.426 | 0.001 | 11 | 0.00 + 3.333 | 50.652 | 50.640 | 23.630 | 0.001 | 11 | 0.00 + 5.556 | 51.086 | 49.146 | 25.787 | 0.075 | 11 | 0.00 + 7.778 | 51.521 | 50.428 | 39.547 | 0.028 | 11 | 0.00 + 10.000 | 51.954 | 51.819 | 35.544 | 0.004 | 11 | 0.00 +sigma_Q: mean relative error = 0.234 +Round-trip residuals: RMS=0.0105, MAD=0.0065, p10=-0.0131, p90=0.0125 +. +=== Per–z-bin diagnostics (dist=gaussian, N=5000) === +z_center | b_expected | b_meas_w | SE_pred(6σ) | |Δb|/SE6 | windows | clipped% +-10.000 | 48.044 | 45.445 | 36.983 | 0.070 | 11 | 0.00 + -7.778 | 48.467 | 51.346 | 30.982 | 0.093 | 11 | 0.00 + -5.556 | 48.905 | 47.117 | 21.779 | 0.082 | 11 | 0.00 + -3.333 | 49.346 | 49.467 | 19.741 | 0.006 | 11 | 0.00 + -1.111 | 49.781 | 47.995 | 19.344 | 0.092 | 11 | 0.00 + 1.111 | 50.223 | 50.531 | 17.426 | 0.018 | 11 | 0.00 + 3.333 | 50.653 | 50.197 | 19.331 | 0.024 | 11 | 0.00 + 5.556 | 51.089 | 52.031 | 24.438 | 0.039 | 11 | 0.00 + 7.778 | 51.540 | 50.049 | 32.818 | 0.045 | 11 | 0.00 + 10.000 | 51.964 | 50.528 | 44.324 | 0.032 | 11 | 0.00 +sigma_Q: mean relative error = 0.227 +Round-trip residuals: RMS=0.0096, MAD=0.0063, p10=-0.0124, p90=0.0120 +. +=== Per–z-bin diagnostics (dist=uniform, N=50000) === +z_center | b_expected | b_meas_w | SE_pred(6σ) | |Δb|/SE6 | windows | clipped% +-10.000 | 48.044 | 48.625 | 11.533 | 0.050 | 11 | 0.00 + -7.778 | 48.466 | 48.704 | 9.667 | 0.025 | 11 | 0.00 + -5.556 | 48.908 | 49.881 | 7.068 | 0.138 | 11 | 0.00 + -3.333 | 49.345 | 49.400 | 5.854 | 0.009 | 11 | 0.00 + -1.111 | 49.779 | 49.666 | 5.393 | 0.021 | 11 | 0.00 + 1.111 | 50.218 | 50.696 | 5.379 | 0.089 | 11 | 0.00 + 3.333 | 50.658 | 50.616 | 5.961 | 0.007 | 11 | 0.00 + 5.556 | 51.090 | 51.635 | 7.125 | 0.076 | 11 | 0.00 + 7.778 | 51.531 | 52.007 | 9.433 | 0.050 | 11 | 0.00 + 10.000 | 51.951 | 51.461 | 11.885 | 0.041 | 11 | 0.00 +sigma_Q: mean relative error = 0.216 +Round-trip residuals: RMS=0.0102, MAD=0.0070, p10=-0.0126, p90=0.0132 +. +=== Per–z-bin diagnostics (dist=poisson, N=50000) === +z_center | b_expected | b_meas_w | SE_pred(6σ) | |Δb|/SE6 | windows | clipped% +-10.000 | 48.050 | 48.036 | 11.973 | 0.001 | 11 | 0.00 + -7.778 | 48.465 | 49.254 | 9.612 | 0.082 | 11 | 0.00 + -5.556 | 48.908 | 49.103 | 7.389 | 0.026 | 11 | 0.00 + -3.333 | 49.346 | 49.667 | 5.971 | 0.054 | 11 | 0.00 + -1.111 | 49.783 | 50.098 | 5.408 | 0.058 | 11 | 0.00 + 1.111 | 50.217 | 50.224 | 5.284 | 0.001 | 11 | 0.00 + 3.333 | 50.659 | 50.869 | 5.858 | 0.036 | 11 | 0.00 + 5.556 | 51.093 | 51.244 | 7.199 | 0.021 | 11 | 0.00 + 7.778 | 51.529 | 52.119 | 9.594 | 0.061 | 11 | 0.00 + 10.000 | 51.952 | 52.148 | 11.380 | 0.017 | 11 | 0.00 +sigma_Q: mean relative error = 0.212 +Round-trip residuals: RMS=0.0097, MAD=0.0061, p10=-0.0134, p90=0.0126 +. 
+=== Per–z-bin diagnostics (dist=gaussian, N=50000) === +z_center | b_expected | b_meas_w | SE_pred(6σ) | |Δb|/SE6 | windows | clipped% +-10.000 | 48.047 | 47.983 | 11.465 | 0.006 | 11 | 0.00 + -7.778 | 48.468 | 49.065 | 9.193 | 0.065 | 11 | 0.00 + -5.556 | 48.908 | 48.766 | 7.118 | 0.020 | 11 | 0.00 + -3.333 | 49.343 | 49.024 | 5.873 | 0.054 | 11 | 0.00 + -1.111 | 49.782 | 50.018 | 5.325 | 0.044 | 11 | 0.00 + 1.111 | 50.217 | 50.396 | 5.166 | 0.035 | 11 | 0.00 + 3.333 | 50.657 | 50.822 | 5.942 | 0.028 | 11 | 0.00 + 5.556 | 51.088 | 51.223 | 7.055 | 0.019 | 11 | 0.00 + 7.778 | 51.529 | 51.175 | 10.191 | 0.035 | 11 | 0.00 + 10.000 | 51.952 | 51.706 | 11.201 | 0.022 | 11 | 0.00 +sigma_Q: mean relative error = 0.228 +Round-trip residuals: RMS=0.0104, MAD=0.0070, p10=-0.0136, p90=0.0127 +. +=== Edge coverage diagnostics === +predicted feasible fraction = 0.458, 6σ lower bound = 0.000, measured finite fraction = 0.458 +edge positive-b fraction = 0.458 +. +7 passed in 2.33s diff --git a/UTILS/dfextensions/quantile_fit_nd/test_quantile_fit_nd.py b/UTILS/dfextensions/quantile_fit_nd/test_quantile_fit_nd.py new file mode 100644 index 000000000..455fd99a8 --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/test_quantile_fit_nd.py @@ -0,0 +1,268 @@ +import json +import numpy as np +import pandas as pd +import pytest + +from dfextensions.quantile_fit_nd.quantile_fit_nd import ( + fit_quantile_linear_nd, + QuantileEvaluator, +) +from dfextensions.quantile_fit_nd.utils import discrete_to_uniform_rank_poisson + + +RNG = np.random.default_rng(42) + + +def gen_Q_from_distribution(dist: str, n: int, params: dict) -> np.ndarray: + if dist == "uniform": + return RNG.uniform(0.0, 1.0, size=n) + elif dist == "poisson": + lam = params.get("lam", 20.0) + k = RNG.poisson(lam, size=n) + return discrete_to_uniform_rank_poisson(k, lam, mode="randomized", rng=RNG) + elif dist == "gaussian": + mu = params.get("mu", 0.0) + sigma = params.get("sigma", 1.0) + g = RNG.normal(mu, sigma, size=n) + from math import erf + z = (g - mu) / max(sigma, 1e-9) + cdf = 0.5 * (1.0 + np.array([erf(zi / np.sqrt(2)) for zi in z])) + return np.clip(cdf, 0.0, 1.0) + else: + raise ValueError(f"unknown dist {dist}") + + +def gen_synthetic_df( + n: int, + dist: str = "uniform", + z_sigma_cm: float = 5.0, + z_range_cm: float = 10.0, + sigma_X_given_Q: float = 0.5, + a0: float = 10.0, + a1: float = 0.5, + b0: float = 50.0, + b1: float = 2.0, +) -> tuple[pd.DataFrame, dict]: + Q = gen_Q_from_distribution(dist, n, params={"lam": 20.0, "mu": 0.0, "sigma": 1.0}) + z = np.clip(RNG.normal(0.0, z_sigma_cm, size=n), -z_range_cm, z_range_cm) + a_true = a0 + a1 * z + b_true = (b0 + b1 * z / max(z_range_cm, 1e-6)).clip(min=5.0) + X = a_true + b_true * Q + RNG.normal(0.0, sigma_X_given_Q, size=n) + df = pd.DataFrame({ + "channel_id": np.repeat("ch0", n), + "Q": Q, + "X": X, + "z_vtx": z, + "is_outlier": np.zeros(n, dtype=bool), + }) + truth = { + "a0": a0, "a1": a1, + "b0": b0, "b1": b1, + "sigma_X_given_Q": sigma_X_given_Q, + "z_range": z_range_cm, + } + return df, truth + + +def _edges_from_centers(centers: np.ndarray) -> np.ndarray: + mid = 0.5 * (centers[1:] + centers[:-1]) + first = centers[0] - (mid[0] - centers[0]) + last = centers[-1] + (centers[-1] - mid[-1]) + return np.concatenate([[first], mid, [last]]) + + +def _expected_b_per_zbin_from_sample(df: pd.DataFrame, z_edges: np.ndarray, truth: dict) -> np.ndarray: + z_vals = df["z_vtx"].to_numpy(np.float64) + b_true_all = (truth["b0"] + truth["b1"] * z_vals / max(truth["z_range"], 
1e-6)).clip(min=5.0) + b_expected = [] + for i in range(len(z_edges) - 1): + m = (z_vals >= z_edges[i]) & (z_vals <= z_edges[i+1]) + b_expected.append(np.mean(b_true_all[m]) if m.sum() > 0 else np.nan) + return np.array(b_expected, dtype=np.float64) + + +def _predicted_se_b_per_zbin(df: pd.DataFrame, z_edges: np.ndarray, q_centers: np.ndarray, dq: float, sigma_X_given_Q: float) -> np.ndarray: + Q_all = df["Q"].to_numpy(np.float64) + z_all = df["z_vtx"].to_numpy(np.float64) + + se_bins = np.full(len(z_edges) - 1, np.nan, dtype=np.float64) + + for i in range(len(z_edges) - 1): + m_z = (z_all >= z_edges[i]) & (z_all <= z_edges[i+1]) + if m_z.sum() < 10: + continue + Qz = Q_all[m_z] + + se_ws = [] + for q0 in q_centers: + in_win = (Qz >= q0 - dq) & (Qz <= q0 + dq) + n_win = int(in_win.sum()) + if n_win < 6: + continue + dq_i = Qz[in_win] - q0 + sxx = float(np.sum((dq_i - dq_i.mean())**2)) + if sxx <= 0: + continue + se_b = sigma_X_given_Q / np.sqrt(max(sxx, 1e-12)) + se_ws.append(se_b) + + if len(se_ws) == 0: + continue + se_bins[i] = float(np.sqrt(np.mean(np.square(se_ws)))) # conservative RMS + + return se_bins + + +def _json_stats_to_arrays(subtable: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]: + """Extract weights (n_used) and clipped flags from fit_stats JSON.""" + n_used = [] + clipped = [] + for s in subtable["fit_stats"]: + try: + d = json.loads(s) + except Exception: + d = {} + n_used.append(d.get("n_used", np.nan)) + clipped.append(bool(d.get("clipped", False))) + return np.array(n_used, dtype=float), np.array(clipped, dtype=bool) + + +@pytest.mark.parametrize("dist", ["uniform", "poisson", "gaussian"]) +@pytest.mark.parametrize("n_points", [5_000, 50_000]) +def test_fit_and_sigmaQ(dist, n_points): + df, truth = gen_synthetic_df(n_points, dist=dist) + q_centers = np.linspace(0.0, 1.0, 11) + dq = 0.05 + + table = fit_quantile_linear_nd( + df, + channel_key="channel_id", + q_centers=q_centers, + dq=dq, + nuisance_axes={"z": "z_vtx"}, + n_bins_axes={"z": 10}, + ) + assert not table.empty + assert {"a", "b", "sigma_Q", "z_center", "q_center", "fit_stats"}.issubset(table.columns) + + # Expected b(z) from sample, using fit's z-bin edges + z_centers = np.sort(table["z_center"].unique()) + z_edges = _edges_from_centers(z_centers) + b_expected = _expected_b_per_zbin_from_sample(df, z_edges, truth) + + # Weighted measured b(z) using window counts (n_used) per (z,q) cell + b_meas_w = np.full_like(b_expected, np.nan, dtype=float) + se_pred = _predicted_se_b_per_zbin(df, z_edges, q_centers, dq, sigma_X_given_Q=truth["sigma_X_given_Q"]) + print("\n=== Per–z-bin diagnostics (dist={}, N={}) ===".format(dist, n_points)) + print("z_center | b_expected | b_meas_w | SE_pred(6σ) | |Δb|/SE6 | windows | clipped%") + + for i, zc in enumerate(z_centers): + g = table[table["z_center"] == zc] + if g.empty: + continue + weights, clipped = _json_stats_to_arrays(g) + # Only use rows with finite b and positive weights + ok = np.isfinite(g["b"].to_numpy()) & (weights > 0) + if ok.sum() == 0: + continue + w = weights[ok] + bvals = g["b"].to_numpy()[ok] + b_meas_w[i] = np.average(bvals, weights=w) + + # Diagnostics + se6 = 6.0 * se_pred[i] if np.isfinite(se_pred[i]) else np.nan + db = abs((b_meas_w[i] - b_expected[i])) if np.isfinite(b_expected[i]) and np.isfinite(b_meas_w[i]) else np.nan + ratio = (db / se6) if (np.isfinite(db) and np.isfinite(se6) and se6 > 0) else np.nan + clip_pct = 100.0 * (clipped[ok].mean() if ok.size else 0.0) + + print(f"{zc:7.3f} | {b_expected[i]:10.3f} | {b_meas_w[i]:8.3f} | {se6:10.3f} 
| {ratio:7.3f} | {ok.sum():7d} | {clip_pct:7.2f}") + + # 6σ check across valid bins + ok_mask = np.isfinite(b_expected) & np.isfinite(b_meas_w) & np.isfinite(se_pred) + assert ok_mask.any(), "No valid z-bins to compare." + abs_diff = np.abs(b_meas_w[ok_mask] - b_expected[ok_mask]) + bound6 = 6.0 * se_pred[ok_mask] + # report worst-case ratio for debug + worst = float(np.nanmax(abs_diff / np.maximum(bound6, 1e-12))) + assert np.all(abs_diff <= (bound6 + 1e-12)), f"6σ slope check failed in at least one z-bin: max(|Δb|/(6·SE)) = {worst:.2f}" + + # sigma_Q vs truth (pragmatic bound) + sigma_q_true = truth["sigma_X_given_Q"] / np.maximum(1e-9, b_expected) + sigma_q_meas = table.groupby("z_center")["sigma_Q"].median().reindex(z_centers).to_numpy() + m_ok = np.isfinite(sigma_q_true) & np.isfinite(sigma_q_meas) + rel_err_sig = np.nanmean(np.abs(sigma_q_meas[m_ok] - sigma_q_true[m_ok]) / np.maximum(1e-9, sigma_q_true[m_ok])) + print(f"sigma_Q: mean relative error = {rel_err_sig:.3f}") + assert rel_err_sig < 0.30, f"sigma_Q rel err too large: {rel_err_sig:.3f}" + + # Round-trip Q->X->Q diagnostics + evalr = QuantileEvaluator(table) + idx = np.linspace(0, len(df) - 1, num=300, dtype=int) + resid = [] + for irow in idx: + z = float(df.loc[irow, "z_vtx"]) + q_true = float(df.loc[irow, "Q"]) + x = float(df.loc[irow, "X"]) + q_hat = evalr.invert_rank(x, channel_id="ch0", z=z) + resid.append(q_hat - q_true) + resid = np.array(resid, dtype=float) + rms = float(np.sqrt(np.mean(np.square(resid)))) + mad = float(np.median(np.abs(resid - np.median(resid)))) + q10, q90 = float(np.quantile(resid, 0.10)), float(np.quantile(resid, 0.90)) + print(f"Round-trip residuals: RMS={rms:.4f}, MAD={mad:.4f}, p10={q10:.4f}, p90={q90:.4f}") + assert rms < 0.07, f"round-trip Q residual RMS too large: {rms:.3f}" + + +def test_edges_behavior(): + # Heavily edge-concentrated Q distribution + n = 20000 + Q = np.concatenate([np.clip(RNG.normal(0.02, 0.01, n//2), 0, 1), + np.clip(RNG.normal(0.98, 0.01, n//2), 0, 1)]) + z = RNG.normal(0.0, 5.0, size=n) + a0, b0, sigma = 5.0, 40.0, 0.4 + X = a0 + b0 * Q + RNG.normal(0.0, sigma, size=n) + + df = pd.DataFrame({"channel_id": "chE", "Q": Q, "X": X, "z_vtx": z, "is_outlier": False}) + q_centers = np.linspace(0, 1, 11) + dq = 0.05 + n_zbins = 6 + + table = fit_quantile_linear_nd( + df, channel_key="channel_id", + q_centers=q_centers, dq=dq, + nuisance_axes={"z": "z_vtx"}, n_bins_axes={"z": n_zbins} + ) + + z_centers = np.sort(table["z_center"].unique()) + z_edges = _edges_from_centers(z_centers) + Q_all = df["Q"].to_numpy(np.float64) + z_all = df["z_vtx"].to_numpy(np.float64) + + edge_q = [0.0, 0.1, 0.9, 1.0] + feasible_flags = [] + for q0 in edge_q: + for i in range(len(z_edges) - 1): + m_z = (z_all >= z_edges[i]) & (z_all <= z_edges[i+1]) + Qz = Q_all[m_z] + n_win = int(((Qz >= q0 - dq) & (Qz <= q0 + dq)).sum()) + feasible_flags.append(n_win >= 6) + feasible_flags = np.array(feasible_flags, dtype=bool) + + predicted_frac = feasible_flags.mean() + measured_tbl = table[table["q_center"].isin(edge_q)] + measured_frac = np.isfinite(measured_tbl["b"]).mean() + + N = feasible_flags.size + se_binom = np.sqrt(max(predicted_frac * (1 - predicted_frac) / max(N, 1), 1e-12)) + lb = max(0.0, predicted_frac - 6.0 * se_binom) + + print("\n=== Edge coverage diagnostics ===") + print(f"predicted feasible fraction = {predicted_frac:.3f}, 6σ lower bound = {lb:.3f}, measured finite fraction = {measured_frac:.3f}") + + assert measured_frac >= lb, ( + f"finite fraction at edges too low: measured 
{measured_frac:.3f}, " + f"predicted {predicted_frac:.3f}, 6σ lower bound {lb:.3f}" + ) + + frac_pos = (measured_tbl["b"] > 0).mean() + print(f"edge positive-b fraction = {frac_pos:.3f}") + assert frac_pos > 0.2, f"positive b fraction too low: {frac_pos:.3f}" diff --git a/UTILS/dfextensions/quantile_fit_nd/utils.py b/UTILS/dfextensions/quantile_fit_nd/utils.py new file mode 100644 index 000000000..cc02e9c36 --- /dev/null +++ b/UTILS/dfextensions/quantile_fit_nd/utils.py @@ -0,0 +1,81 @@ +# dfextensions/quantile_fit_nd/utils.py +import numpy as np +from typing import Optional + +def discrete_to_uniform_rank_poisson( + k: np.ndarray, + lam: float, + mode: str = "randomized", + rng: Optional[np.random.Generator] = None, +) -> np.ndarray: + """ + Map Poisson counts k ~ Poisson(lam) to U ~ Uniform(0,1). + + mode="randomized" (preferred, exact PIT): + U = F(k-1) + V * (F(k) - F(k-1)), V ~ Unif(0,1) + mode="midrank" (deterministic): + U = 0.5 * (F(k-1) + F(k)) + + Returns U in [0,1]. + """ + k = np.asarray(k, dtype=np.int64) + if rng is None: + rng = np.random.default_rng() + + k_max = int(np.max(k)) if k.size else 0 + pmf = np.empty(k_max + 1, dtype=np.float64) + pmf[0] = np.exp(-lam) + for j in range(k_max): + pmf[j + 1] = pmf[j] * lam / (j + 1) + + cdf = np.cumsum(pmf) + cdf = np.clip(cdf, 0.0, 1.0) + + Fk = cdf[k] + Fkm1 = np.where(k > 0, cdf[k - 1], 0.0) + + if mode == "randomized": + u = rng.random(size=k.size) + U = Fkm1 + u * (Fk - Fkm1) + elif mode == "midrank": + U = 0.5 * (Fkm1 + Fk) + else: + raise ValueError(f"unknown mode {mode!r}") + + return np.clip(U, 0.0, 1.0) + + +def discrete_to_uniform_rank_empirical( + x: np.ndarray, + mode: str = "randomized", + rng: Optional[np.random.Generator] = None, +) -> np.ndarray: + """ + Generic discrete -> Uniform(0,1) using the empirical CDF of x. + + For unique value v with mass p_v and cumulative F(v): + randomized: U ~ Uniform(F(v-), F(v)) + midrank: U = 0.5 * (F(v-) + F(v)) + """ + x = np.asarray(x) + n = x.size + if rng is None: + rng = np.random.default_rng() + if n == 0: + return np.array([], dtype=np.float64) + + uniq, inv = np.unique(x, return_inverse=True) + counts = np.bincount(inv, minlength=uniq.size) + cum = np.cumsum(counts) + F_curr = cum / float(n) + F_prev = (cum - counts) / float(n) + + if mode == "randomized": + u = rng.random(size=n) + U = F_prev[inv] + u * (F_curr[inv] - F_prev[inv]) + elif mode == "midrank": + U = 0.5 * (F_prev[inv] + F_curr[inv]) + else: + raise ValueError(f"unknown mode {mode!r}") + + return np.clip(U, 0.0, 1.0) diff --git a/UTILS/dfextensions/restartContext_groupby_regression.md b/UTILS/dfextensions/restartContext_groupby_regression.md new file mode 100644 index 000000000..8790c22e7 --- /dev/null +++ b/UTILS/dfextensions/restartContext_groupby_regression.md @@ -0,0 +1,181 @@ +Short answer: **yes — a small update to `restartContext_groupby_regression.md` now will help** keep everyone aligned. We learned that response-only outliers don’t reproduce the slowdown; we set a new 5k/5 default; and we added a 30% scenario. That’s enough to warrant a “delta” refresh so Claude/others don’t chase the wrong thing. + +Here’s a **drop-in replacement** you can paste over the current file (keeps the same structure, updated content). If you prefer to append, you can paste the “What changed” + “Action plan” sections at the top. 
+ +--- + +# Restart Context: GroupBy Regression Benchmarking & Documentation + +**Date:** October 22, 2025 +**Project:** dfextensions (ALICE O2 Physics) +**Focus:** `groupby_regression.py` — benchmarking & performance docs +**Target:** PR by Oct 25, 2025 +**Collaborators:** GPT (Primary Coder), Claude (Reviewer), User (Approval) + +--- + +## Executive Summary (updated) + +Benchmarks on synthetic data show that **response-only outliers (shift in y)** do **not** slow down the OLS/robust path; runtime remains essentially unchanged even at **30% contamination**. Both Mac and Linux show similar **scaling** (Linux ≈2–2.5× slower wall time per 1k groups due to platform factors). +The **real-data 25× slowdown** is therefore likely due to **sigmaCut-triggered robust re-fits driven by leverage outliers in X** and/or multi-target fits (e.g., `dX,dY,dZ`) compounding the cost. + +**New default benchmark:** **5,000 groups × 5 rows/group** (fast, representative). +**New scenarios:** include **30% outliers (5σ)** to demonstrate stability of response-only contamination. + +--- + +## What changed since last update + +* **Benchmark defaults:** `--rows-per-group 5 --groups 5000` adopted for docs & CI-friendly runs. +* **Scenarios:** Added **30% outliers (5σ)** in serial + parallel. +* **Findings:** + + * Mac (per 1k groups): serial ~**1.69 s**, parallel(10) ~**0.50 s**. + * Linux (per 1k groups): serial ~**4.14 s**, parallel(10) ~**0.98 s**. + * 5–30% response outliers: **no runtime increase** vs clean. +* **Conclusion:** Synthetic setup doesn’t trigger the **re-fit loop**; real data likely has **leverage** characteristics or different fitter path. + +--- + +## Problem Statement (refined) + +Observed **~25× slowdown** on real datasets when using `sigmaCut` robust filtering. Root cause is presumed **iterative re-fitting per group** when the mask updates (MAD-based) repeatedly exclude many points — common under **leverage outliers in X** or mixed contamination (X & y). Multi-target fitting (e.g., 3 targets) likely multiplies cost. + +--- + +## Cross-Platform Note + +Linux runs are **~2–2.5×** slower in absolute time than Mac, but **parallel speed-ups are consistent** (~4–5×). Differences are due to CPU/BLAS/spawn model (Apptainer), not algorithmic changes. + +--- + +## Action Plan (next 48h) + +1. **Add leverage-outlier generator** to benchmarks + + * API: `create_data_with_outliers(..., mode="response|leverage|both", x_mag=8.0)` + * Goal: Reproduce sigmaCut re-fit slow path (target 10–25×). +2. **Instrument the fitter** + + * Add counters in `process_group_robust()`: + + * `n_refits`, `mask_fraction`, and per-group timings. + * Emit aggregated stats in `dfGB` (or a side JSON) for diagnostics. +3. **Multi-target cost check** + + * Run with `fit_columns=['dX']`, then `['dX','dY','dZ']` to quantify multiplicative cost. +4. **Config toggles for mitigation** (document in perf section) + + * `sigmaCut=100` (disable re-fit) as a “fast path” when upstream filtering is trusted. + * Optional `max_refits` (cap iterations), log a warning when hit. + * Consider `fitter='huber'` fast-path if available. +5. **Finalize docs** + + * Keep 5k/5 as **doc default**; show Mac+Linux tables. + * Add a **“Stress Test (Leverage)”** table once generator is merged. 
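+
+As a concrete starting point for item 1 above, a minimal sketch of the proposed generator (the name `create_data_with_outliers` and the `mode`/`x_mag` arguments follow the API proposed in the list; the contamination mechanics, the `shift_sigma` argument, and the remaining defaults are assumptions, not a final implementation):
+
+```python
+import numpy as np
+import pandas as pd
+
+def create_data_with_outliers(n_groups=5000, rows_per_group=5, frac_outliers=0.05,
+                              mode="response", x_mag=8.0, shift_sigma=5.0, seed=0):
+    """Sketch: synthetic groups with response and/or leverage contamination."""
+    if mode not in ("response", "leverage", "both"):
+        raise ValueError(f"unknown mode {mode!r}")
+    rng = np.random.default_rng(seed)
+    n = n_groups * rows_per_group
+    g = np.repeat(np.arange(n_groups), rows_per_group)
+    x = rng.normal(size=n)
+    noise_sigma = 1.0
+    y = 2.0 * x + rng.normal(scale=noise_sigma, size=n)
+    out = rng.random(n) < frac_outliers
+    sign = rng.choice([-1.0, 1.0], size=int(out.sum()))
+    if mode in ("response", "both"):
+        # shift y only -- the case benchmarked so far, which shows no slowdown
+        y[out] += shift_sigma * noise_sigma * sign
+    if mode in ("leverage", "both"):
+        # inflate |x| so the points carry high leverage, and keep them off-trend
+        # so the MAD-based mask keeps updating between robust re-fits
+        x[out] *= x_mag
+        y[out] = 2.0 * x[out] + shift_sigma * noise_sigma * sign
+    return pd.DataFrame({"group": g, "x1": x, "y": y, "weight": 1.0})
+```
+
+Running the stress-test command sketched under "Current Commands" below on such data is expected (not yet verified) to trigger the sigmaCut re-fit loop that the response-only scenarios never exercised.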
+ +--- + +## Deliverables Checklist + +* [x] Single-file benchmark with 5k/5 default & 30% outlier scenarios +* [x] Performance section in `groupby_regression.md` (Mac/Linux tables) +* [ ] **Leverage-outlier generator** (+ scenarios) +* [ ] Fitter instrumentation (refit counters, timings) +* [ ] Performance tests (CI thresholds for clean vs stress) +* [ ] `BENCHMARKS.md` with full runs & environment capture + +--- + +## Current Commands + +**Default quick run (docs/CI):** + +```bash +python3 bench_groupby_regression.py \ + --rows-per-group 5 --groups 5000 \ + --n-jobs 10 --sigmaCut 5 --fitter ols \ + --out bench_out --emit-csv +``` + +**Stress test placeholder (to be added):** + +```bash +python3 bench_groupby_regression.py \ + --rows-per-group 5 --groups 5000 \ + --n-jobs 10 --sigmaCut 5 --fitter ols \ + --mode leverage --x-mag 8.0 \ + --out bench_out_stress --emit-csv +``` + +--- + +## Risks & Open Questions + +* What outlier **structure** in real data triggers the re-fit? (X leverage? heteroscedasticity? group size variance?) +* Is the slowdown proportional to **targets × refits × groups**? +* Do container spawn/backends (forkserver/spawn) amplify overhead for very small groups? + +--- + +**Last updated:** Oct 22, 2025 (this revision) +# Restart Context: GroupBy Regression Benchmarking & Diagnostics Integration + +**Date:** October 23 2025 +**Project:** dfextensions (ALICE O2 Physics) +**Focus:** `groupby_regression.py` — diagnostic instrumentation and benchmark integration +**Next Phase:** Real-data performance characterization + +--- + +## Summary of Latest Changes + +* **Diagnostics added to core class** + - `GroupByRegressor.summarize_diagnostics()` and `format_diagnostics_summary()` now compute mean/median/std + quantiles (p50–p99) for all key diagnostic metrics (`time_ms`, `n_refits`, `frac_rejected`, `cond_xtx`, `hat_max`, `n_rows`). + - Handles both prefixed (`diag_…`) and suffixed (`…_fit`, `…_dIDC`) columns. + +* **Benchmark integration** + - `bench_groupby_regression.py` now: + - Calls class-level summary after each scenario. + - Writes per-scenario `diag_summary.csv` and appends human-readable summaries to `benchmark_report.txt`. + - Saves `diag_top10_time__.csv` and `diag_top10_refits__.csv` for quick inspection. + - Default benchmark: `--rows-per-group 5 --groups 1000 --diag`. + +* **Validation** + - Real-data summary confirmed correct suffix handling (`_dIDC`). + - Pytest and all synthetic benchmarks pass. + +--- + +## Observations from Real Data + +* Median per-group fit time ≈ 7 ms (p99 ≈ 12 ms). +* ~99 % of groups perform 3 robust re-fits → robust loop fully active. +* Only ~2 % mean rejection fraction, but 99th percentile ≈ 0.4 → a few heavy-outlier bins drive cost. +* Conditioning (cond_xtx ≈ 1) and leverage (hat_max ≈ 0.18) are stable → slowdown dominated by the sigmaCut iteration. + +--- + +## Next Steps (Real-Use-Case Phase) + +1. **Collect diagnostic distributions on full calibration samples** + - Export `diag_full__*` and `diag_top10_*` CSVs. + - Aggregate with `summarize_diagnostics()` to study tails and correlations. + +2. **Benchmark subsets vs. full parallel runs** + - Quantify the gain observed when splitting into smaller chunks (cache + spawn effects). + +3. **Add leverage-outlier generator** to reproduce re-fit behaviour in synthetic benchmarks. + +4. **Consider optimization paths** + - Cap `max_refits` / early-stop criterion. + - Introduce `make_parallel_fitFast` minimal version for groups O(10). + +5. 
**Documentation** + - Update `groupby_regression.md` “Performance & Benchmarking” section with diagnostic summary example and reference to top-violator CSVs. + +--- + +**Last updated:** Oct 23 2025 + + diff --git a/UTILS/dfextensions/test_groupby_regression.py b/UTILS/dfextensions/test_groupby_regression.py new file mode 100644 index 000000000..6765cbeaf --- /dev/null +++ b/UTILS/dfextensions/test_groupby_regression.py @@ -0,0 +1,420 @@ +import pytest +import pandas as pd +import numpy as np +#from groupby_regression import GroupByRegressor +from .groupby_regression import GroupByRegressor + +@pytest.fixture +def sample_data(): + np.random.seed(0) + n = 100 + df = pd.DataFrame({ + 'group': np.random.choice(['A', 'B'], size=n), + 'x1': np.random.normal(loc=0, scale=1, size=n), + 'x2': np.random.normal(loc=5, scale=2, size=n), + }) + df['y'] = 2.0 * df['x1'] + 3.0 * df['x2'] + np.random.normal(0, 0.5, size=n) + df['weight'] = np.ones(n) + return df + + +def test_make_linear_fit_basic(sample_data): + df = sample_data.copy() + df_out, dfGB = GroupByRegressor.make_linear_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + suffix='_fit', + selection=(df['x1'] > -10), + addPrediction=True + ) + assert not dfGB.empty + assert 'y_fit' in df_out.columns + assert 'y_slope_x1_fit' in dfGB.columns + assert 'x1_fit' in dfGB.columns + + +def test_make_parallel_fit_robust(sample_data): + df = sample_data.copy() + df_out, dfGB = GroupByRegressor.make_parallel_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + weights='weight', + suffix='_rob', + selection=(df['x1'] > -10), + addPrediction=True, + n_jobs=1, + min_stat=[5, 5] + ) + assert not dfGB.empty + assert 'y_rob' in df_out.columns + assert 'y_slope_x1_rob' in dfGB.columns + assert 'y_intercept_rob' in dfGB.columns + + +def test_insufficient_data(sample_data): + df = sample_data.copy() + df = df[df['group'] == 'A'].iloc[:5] # Force small group + df_out, dfGB = GroupByRegressor.make_linear_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + suffix='_tiny', + selection=(df['x1'] > -10), + addPrediction=True, + min_stat=10 + ) + assert len(dfGB) <= 1 # Could be empty or single group with skipped fit + assert 'y_tiny' in df_out.columns + assert dfGB.get('y_slope_x1_tiny') is None or dfGB['y_slope_x1_tiny'].isna().all() + assert dfGB.get('y_intercept_tiny') is None or dfGB['y_intercept_tiny'].isna().all() + + +def test_prediction_accuracy(sample_data): + df = sample_data.copy() + df_out, dfGB = GroupByRegressor.make_linear_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + suffix='_pred', + selection=(df['x1'] > -10), + addPrediction=True + ) + errors = df_out['y'] - df_out['y_pred'] + assert errors.std() < 1.0 # Should be close to noise level + + +def test_missing_values(): + df = pd.DataFrame({ + 'group': ['A', 'A', 'B', 'B'], + 'x1': [1.0, 2.0, np.nan, 4.0], + 'x2': [2.0, 3.0, 1.0, np.nan], + 'y': [5.0, 8.0, 4.0, 6.0], + 'weight': [1.0, 1.0, 1.0, 1.0] + }) + selection = df['x1'].notna() & df['x2'].notna() + df_out, dfGB = GroupByRegressor.make_linear_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + suffix='_nan', + selection=selection, + addPrediction=True + ) + assert 'y_nan' in df_out.columns + assert df_out['y_nan'].isna().sum() >= 0 # No 
crash due to missing data + + +def test_cast_dtype_effect(): + df = pd.DataFrame({ + 'group': ['G1'] * 10, + 'x1': np.linspace(0, 1, 10), + 'x2': np.linspace(1, 2, 10), + }) + df['y'] = 2.0 * df['x1'] + 3.0 * df['x2'] + df['weight'] = 1.0 + + df_out, dfGB = GroupByRegressor.make_linear_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + suffix='_typed', + selection=(df['x1'] >= 0), + addPrediction=True, + cast_dtype='float32' + ) + + assert dfGB['y_slope_x1_typed'].dtype == np.float32 + assert dfGB['y_slope_x2_typed'].dtype == np.float32 + + +def test_robust_outlier_resilience(): + np.random.seed(0) + x1 = np.random.uniform(0, 1, 100) + x2 = np.random.uniform(10, 20, 100) + y = 2.0 * x1 + 3.0 * x2 + y[::10] += 50 # Inject outliers every 10th sample + + df = pd.DataFrame({ + 'group': ['G1'] * 100, + 'x1': x1, + 'x2': x2, + 'y': y, + 'weight': 1.0 + }) + + _, df_robust = GroupByRegressor.make_parallel_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + weights='weight', + suffix='_robust', + selection=(df['x1'] >= 0), + addPrediction=True, + n_jobs=1 + ) + + assert np.isclose(df_robust['y_slope_x1_robust'].iloc[0], 2.0, atol=0.5) + assert np.isclose(df_robust['y_slope_x2_robust'].iloc[0], 3.0, atol=0.5) + + +def test_exact_coefficient_recovery(): + np.random.seed(0) + x1 = np.random.uniform(0, 1, 100) + x2 = np.random.uniform(10, 20, 100) + df = pd.DataFrame({ + 'group': ['G1'] * 100, + 'x1': x1, + 'x2': x2, + }) + df['y'] = 2.0 * df['x1'] + 3.0 * df['x2'] + df['weight'] = 1.0 + + df_out, dfGB = GroupByRegressor.make_linear_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + suffix='_clean', + selection=(df['x1'] >= 0), + addPrediction=True + ) + + assert np.isclose(dfGB['y_slope_x1_clean'].iloc[0], 2.0, atol=1e-6) + assert np.isclose(dfGB['y_slope_x2_clean'].iloc[0], 3.0, atol=1e-6) + + +def test_exact_coefficient_recovery_parallel(): + np.random.seed(0) + x1 = np.random.uniform(0, 1, 100) + x2 = np.random.uniform(10, 20, 100) + df = pd.DataFrame({ + 'group': ['G1'] * 100, + 'x1': x1, + 'x2': x2, + }) + df['y'] = 2.0 * df['x1'] + 3.0 * df['x2'] + df['weight'] = 1.0 + + df_out, dfGB = GroupByRegressor.make_parallel_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + weights='weight', + suffix='_par', + selection=(df['x1'] >= 0), + addPrediction=True, + n_jobs=1, + min_stat=[1, 1] + ) + + assert np.isclose(dfGB['y_slope_x1_par'].iloc[0], 2.0, atol=1e-6) + assert np.isclose(dfGB['y_slope_x2_par'].iloc[0], 3.0, atol=1e-6) + + +def test_min_stat_per_predictor(): + # Create a group with 20 rows total, but only 5 valid for x2 + df = pd.DataFrame({ + 'group': ['G1'] * 20, + 'x1': np.linspace(0, 1, 20), + 'x2': [np.nan] * 15 + list(np.linspace(0, 1, 5)), + }) + df['y'] = 2.0 * df['x1'] + 3.0 * np.nan_to_num(df['x2']) + np.random.normal(0, 0.01, 20) + df['weight'] = 1.0 + + # Use all 20 rows, but let selection ensure only valid ones go into each predictor fit + selection = df['x1'].notna() & df['y'].notna() + + df_out, dfGB = GroupByRegressor.make_parallel_fit( + df, + gb_columns=['group'], + fit_columns=['y'], + linear_columns=['x1', 'x2'], + median_columns=['x1'], + weights='weight', + suffix='_minstat', + selection=selection, + addPrediction=True, + min_stat=[10, 10], # x1: 20 valid rows; x2: only 5 + n_jobs=1 + ) + + assert 'y_slope_x1_minstat' in 
dfGB.columns
+    assert not np.isnan(dfGB['y_slope_x1_minstat'].iloc[0])  # x1 passed
+    assert 'y_slope_x2_minstat' not in dfGB.columns or np.isnan(dfGB['y_slope_x2_minstat'].iloc[0])  # x2 skipped
+
+
+def test_sigma_cut_impact():
+    np.random.seed(0)
+    n_samples = 10000
+    df = pd.DataFrame({
+        'group': ['G1'] * n_samples,
+        'x1': np.linspace(0, 1, n_samples),
+    })
+    df['y'] = 3.0 * df['x1'] + np.random.normal(0, 0.1, size=n_samples)
+    df.loc[::50, 'y'] += 100  # Insert strong outliers every 50th sample
+    df['weight'] = 1.0
+    selection = df['x1'].notna() & df['y'].notna()
+
+    _, dfGB_all = GroupByRegressor.make_parallel_fit(
+        df, ['group'], ['y'], ['x1'], ['x1'], 'weight', '_s100',
+        selection=selection, sigmaCut=100, n_jobs=1, addPrediction=True
+    )
+
+    _, dfGB_strict = GroupByRegressor.make_parallel_fit(
+        df, ['group'], ['y'], ['x1'], ['x1'], 'weight', '_s2',
+        selection=selection, sigmaCut=3, n_jobs=1, addPrediction=True
+    )
+
+    slope_all = dfGB_all['y_slope_x1_s100'].iloc[0]
+    slope_strict = dfGB_strict['y_slope_x1_s2'].iloc[0]
+
+    assert abs(slope_strict - 3.0) < abs(slope_all - 3.0), \
+        f"Robust fit with sigmaCut=3 should be closer to truth: slope_strict={slope_strict}, slope_all={slope_all}"
+
+
+def test_make_parallel_fit_robust_fitter(sample_data):
+    # Same setup as test_make_parallel_fit_robust above, but with an explicit
+    # fitter="robust"; renamed so it no longer shadows that test.
+    df = sample_data.copy()
+    df_out, dfGB = GroupByRegressor.make_parallel_fit(
+        df,
+        gb_columns=['group'],
+        fit_columns=['y'],
+        linear_columns=['x1', 'x2'],
+        median_columns=['x1'],
+        weights='weight',
+        suffix='_rob',
+        selection=(df['x1'] > -10),
+        addPrediction=True,
+        n_jobs=1,
+        min_stat=[5, 5],
+        fitter="robust"
+    )
+    assert not dfGB.empty
+    assert 'y_rob' in df_out.columns
+    assert 'y_slope_x1_rob' in dfGB.columns
+    assert 'y_intercept_rob' in dfGB.columns
+
+
+def test_make_parallel_fit_with_linear_regression(sample_data):
+    df = sample_data.copy()
+    df_out, dfGB = GroupByRegressor.make_parallel_fit(
+        df,
+        gb_columns=['group'],
+        fit_columns=['y'],
+        linear_columns=['x1', 'x2'],
+        median_columns=['x1'],
+        weights='weight',
+        suffix='_ols',
+        selection=(df['x1'] > -10),
+        addPrediction=True,
+        n_jobs=1,
+        min_stat=[5, 5],
+        fitter="ols"
+    )
+    assert not dfGB.empty
+    assert 'y_ols' in df_out.columns
+    assert 'y_slope_x1_ols' in dfGB.columns
+    assert 'y_intercept_ols' in dfGB.columns
+
+def test_make_parallel_fit_with_custom_fitter(sample_data):
+    class DummyFitter:
+        def fit(self, X, y, sample_weight=None):
+            self.coef_ = np.zeros(X.shape[1])
+            self.intercept_ = 42
+            return self
+
+        def predict(self, X):
+            return np.full(X.shape[0], self.intercept_)
+
+    df = sample_data.copy()
+    df_out, dfGB = GroupByRegressor.make_parallel_fit(
+        df,
+        gb_columns=['group'],
+        fit_columns=['y'],
+        linear_columns=['x1'],
+        median_columns=['x1'],
+        weights='weight',
+        suffix='_dummy',
+        selection=(df['x1'] > -10),
+        addPrediction=True,
+        n_jobs=1,
+        min_stat=[5],
+        fitter=DummyFitter
+    )
+    predicted = df_out['y_dummy'].dropna()
+    assert not predicted.empty
+    assert np.allclose(predicted.unique(), 42)
+    assert 'y_slope_x1_dummy' in dfGB.columns
+    assert dfGB['y_slope_x1_dummy'].iloc[0] == 0
+    assert dfGB['y_intercept_dummy'].iloc[0] == 42
+
+
+def _make_groups(n_rows, n_groups, seed=0):
+    rng = np.random.default_rng(seed)
+    base = np.repeat(np.arange(n_groups, dtype=np.int32), n_rows // n_groups)
+    rem = n_rows - base.size
+    if rem > 0:
+        base = np.concatenate([base, rng.choice(n_groups, size=rem, replace=False)])
+    rng.shuffle(base)
+    return base
+
+def _create_clean(n_rows=1000, n_groups=200, seed=0):
+    rng = np.random.default_rng(seed)
+    g =
_make_groups(n_rows, n_groups, seed)
+    x = rng.normal(size=(n_rows, 2)).astype(np.float32)
+    y = (2*x[:,0] + 3*x[:,1] + rng.normal(0,1.0,size=n_rows)).astype(np.float32)
+    df = pd.DataFrame({"group": g, "x1": x[:,0], "x2": x[:,1], "y": y})
+    df["group2"] = df["group"]
+    df["weight"] = 1.0
+    return df
+
+def test_diagnostics_columns_present():
+    df = _create_clean()
+    sel = pd.Series(True, index=df.index)
+    _, dfGB = GroupByRegressor.make_parallel_fit(
+        df,
+        gb_columns=["group", "group2"],
+        fit_columns=["y"],
+        linear_columns=["x1", "x2"],
+        median_columns=[],
+        weights="weight",
+        suffix="_fit",
+        selection=sel,
+        addPrediction=False,
+        n_jobs=1,
+        min_stat=[3, 4],
+        sigmaCut=5,
+        fitter="ols",
+        batch_size="auto",
+        diag=True,  # exercise the diagnostics path
+        diag_prefix="diag_",
+    )
+    # Diagnostic columns carry the fit suffix (e.g. diag_n_refits_fit)
+    suffix = "_fit"
+    cols = [
+        f"diag_n_refits{suffix}", f"diag_frac_rejected{suffix}", f"diag_hat_max{suffix}",
+        f"diag_cond_xtx{suffix}", f"diag_time_ms{suffix}", f"diag_n_rows{suffix}",
+    ]
+
+    for c in cols:
+        assert c in dfGB.columns, f"missing diagnostic column {c}"
+    assert (dfGB[f"diag_n_refits{suffix}"] >= 0).all()
diff --git a/UTILS/perf_log.txt b/UTILS/perf_log.txt
new file mode 100644
index 000000000..36ed73bee
--- /dev/null
+++ b/UTILS/perf_log.txt
@@ -0,0 +1,20 @@
+2025-05-31 18:42:55,604 | setup::start | 0.00 | 0.22 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 18:42:55,706 | loop::step[0] | 0.10 | 0.22 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 18:42:55,909 | loop::step[1] | 0.30 | 0.22 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 18:42:56,210 | loop::step[2] | 0.61 | 0.22 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 18:43:11,924 | setup::start | 0.00 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 18:43:12,026 | loop::step[0] | 0.10 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 18:43:12,231 | loop::step[1] | 0.31 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 18:43:12,537 | loop::step[2] | 0.61 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:17:53,659 | setup::start | 0.00 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:17:53,764 | loop::step[0] | 0.11 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:17:53,970 | loop::step[1] | 0.31 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:17:54,274 | loop::step[2] | 0.61 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:17:56,137 | setup::start | 0.00 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:17:56,238 | loop::step[0] | 0.10 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:17:56,444 | loop::step[1] | 0.31 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:17:56,750 | loop::step[2] | 0.61 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:19:43,683 | setup::start | 0.00 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:19:43,787 | loop::step[0] | 0.10 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:19:43,993 | loop::step[1] | 0.31 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
+2025-05-31 19:19:44,295 | loop::step[2] | 0.61 | 0.13 | miranov25 | Marians-MBP-3.fritz.box
diff --git a/UTILS/perf_plots.pdf b/UTILS/perf_plots.pdf
new file mode 100644
index 000000000..7ffb05bc0
Binary files /dev/null and b/UTILS/perf_plots.pdf differ
diff --git a/UTILS/perfmonitor/README.md
b/UTILS/perfmonitor/README.md
new file mode 100644
index 000000000..f7c959048
--- /dev/null
+++ b/UTILS/perfmonitor/README.md
@@ -0,0 +1,158 @@
+# Performance Monitor
+
+Lightweight logging and analysis utility for tracking performance (execution time and memory) of scripts or processing pipelines.
+
+## Features
+
+* Logs elapsed time and memory (RSS) per step
+* Supports multi-level index tags for loop tracking
+* Saves logs in delimiter-separated format (default: `|`)
+* Parses logs to `pandas.DataFrame` for analysis
+* Summarizes stats (mean, max, min) with configurable grouping
+* Plots memory/time using `matplotlib`
+* Optionally saves plots to a PDF
+* Combines logs from multiple files
+
+## Installation
+
+This is a self-contained utility. Just place the `perfmonitor/` directory into your Python path.
+
+## Example Usage
+
+```python
+import time
+import pandas as pd
+import matplotlib.pyplot as plt
+from perfmonitor import PerformanceLogger, default_plot_config, default_summary_config
+
+# Initialize logger
+logger = PerformanceLogger("perf_log.txt")
+logger.log("setup::start")
+
+# Simulate steps with increasing delays
+for i, delay in enumerate([0.1, 0.2, 0.3]):
+    time.sleep(delay)
+    logger.log("loop::step", index=[i])
+
+# Parse logs from one or more files
+df = PerformanceLogger.log_to_dataframe(["perf_log.txt"])
+print(df.head())
+```
+
+### Expected Output
+
+Example output from `print(df.head())`. Note that the `[i]` index tag is stripped from `step` and moved to the `index_0` column, and `elapsed_sec` is cumulative since logger creation:
+
+```
+                 timestamp          step  elapsed_sec  rss_gb     user        host       logfile  index_0
+0  2025-05-31 09:12:01,120  setup::start         0.00    0.13  user123  host.local  perf_log.txt      NaN
+1  2025-05-31 09:12:01,220    loop::step         0.10    0.14  user123  host.local  perf_log.txt      0.0
+2  2025-05-31 09:12:01,420    loop::step         0.30    0.15  user123  host.local  perf_log.txt      1.0
+3  2025-05-31 09:12:01,720    loop::step         0.60    0.15  user123  host.local  perf_log.txt      2.0
+```
+
+## Summary Statistics
+
+```python
+summary = PerformanceLogger.summarize_with_configs(df, default_summary_config)
+print(summary)
+```
+
+### Example Summary Output
+
+```
+{'summary_by_step':              elapsed_sec                 rss_gb
+                     mean   max  min count   mean   max   min count
+step
+loop::step           0.34  0.61  0.1    15  0.148  0.22  0.13    15
+setup::start         0.00  0.00  0.0     5  0.148  0.22  0.13     5,
+ 'summary_by_step_and_index':       elapsed_sec                 rss_gb
+                      mean   max   min count   mean   max   min count
+step       index_0
+loop::step 0.0       0.102  0.11  0.10     5  0.148  0.22  0.13     5
+           1.0       0.308  0.31  0.30     5  0.148  0.22  0.13     5
+           2.0       0.610  0.61  0.61     5  0.148  0.22  0.13     5}
+```
+
+## Plotting
+
+```python
+# Show plots
+PerformanceLogger.plot(df, default_plot_config)
+
+# Save plots to PDF
+PerformanceLogger.plot(df, default_plot_config, output_pdf="perf_plots.pdf")
+```
+
+## Multi-Level Index Extraction
+
+Step IDs can include index metadata like:
+
+```
+load::data[1,2]
+```
+
+This will be automatically parsed into new DataFrame columns:
+
+* `index_0` → 1
+* `index_1` → 2
+
+## Advanced: Custom Configuration
+
+Custom configurations can be obtained by modifying the `default_plot_config` and `default_summary_config` dictionaries and invoking `PerformanceLogger.plot` and `PerformanceLogger.summarize_with_configs` with those configs.
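+
+For example, a minimal customization might look like this (a sketch: the tweaked title, the `step == 'loop::step'` filter, and the `perf_custom.pdf` output name are illustrative, not part of the package):
+
+```python
+import copy
+from perfmonitor import PerformanceLogger, default_plot_config, default_summary_config
+
+df = PerformanceLogger.log_to_dataframe(["perf_log.txt"])
+
+# Start from the shipped defaults and change only what differs.
+my_plots = copy.deepcopy(default_plot_config)
+my_plots["Elapsed Time vs Step"]["title"] = "Elapsed time per step (loop only)"
+my_plots["Elapsed Time vs Step"]["filter"] = "step == 'loop::step'"  # per-plot row filter
+
+my_summary = copy.deepcopy(default_summary_config)
+my_summary["summary_by_step"]["stats"] = ["mean", "max", "count"]
+
+PerformanceLogger.plot(df, my_plots, output_pdf="perf_custom.pdf")
+summaries = PerformanceLogger.summarize_with_configs(df, my_summary)
+print(summaries["summary_by_step"])
+```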
+
+The shipped defaults are:
+
+```python
+default_plot_config={
+    "RSS vs Time": {
+        "kind": "line",
+        "varX": "timestamp",
+        "varY": "rss_gb",
+        "title": "RSS over Time",
+        "sort": "timestamp"
+    },
+    "RSS vs Step (chronological)": {
+        "kind": "line",
+        "varX": "rowID",
+        "varY": "rss_gb",
+        "title": "RSS vs Step",
+        "xlabel": "step",
+        "xticklabels": "step",
+        "sort": "rowID"
+    },
+    "Elapsed Time vs Step": {
+        "kind": "bar",
+        "varX": "step",
+        "varY": "elapsed_sec",
+        "title": "Elapsed Time per Step",
+        "sort": None
+    },
+    "RSS Summary Stats": {
+        "varX": "step",
+        "varY": "rss_gb",
+        "aggregation": ["mean", "median", "std"],
+        "title": "RSS Summary Statistics",
+        "xlabel": "Step",
+        "ylabel": "RSS (GB)",
+        "sort": "step"
+    }
+}
+
+default_summary_config={
+    "summary_by_step": {
+        "by": ["step"],
+        "stats": ["mean", "max", "min", "count"]
+    },
+    "summary_by_step_and_index": {
+        "by": ["step", "index_0"],
+        "stats": ["mean", "max", "min", "count"]
+    }
+}
+```
+
+## License
+???
diff --git a/UTILS/perfmonitor/__init__.py b/UTILS/perfmonitor/__init__.py
new file mode 100644
index 000000000..f9876a462
--- /dev/null
+++ b/UTILS/perfmonitor/__init__.py
@@ -0,0 +1,13 @@
+# perfmonitor/__init__.py
+
+from .performance_logger import (
+    PerformanceLogger,
+    default_plot_config,
+    default_summary_config
+)
+
+__all__ = [
+    "PerformanceLogger",
+    "default_plot_config",
+    "default_summary_config"
+]
diff --git a/UTILS/perfmonitor/perf_log.txt b/UTILS/perfmonitor/perf_log.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/UTILS/perfmonitor/performance_logger.py b/UTILS/perfmonitor/performance_logger.py
new file mode 100644
index 000000000..58035093e
--- /dev/null
+++ b/UTILS/perfmonitor/performance_logger.py
@@ -0,0 +1,209 @@
+import time
+import psutil
+import socket
+import getpass
+import pandas as pd
+import matplotlib.pyplot as plt
+from typing import Union, List, Dict, Optional
+import sys
+
+class PerformanceLogger:
+    def __init__(self, log_path: str, sep: str = "|"):
+        self.log_path = log_path
+        self.start_time = time.time()
+        self.sep = sep
+        self.user = getpass.getuser()
+        self.host = socket.gethostname()
+
+    def log(self, step: str, index: Optional[List[int]] = None):
+        elapsed = time.time() - self.start_time
+        mem_gb = psutil.Process().memory_info().rss / (1024 ** 3)
+        index_str = "" if index is None else f"[{','.join(map(str, index))}]"
+        step_full = f"{step}{index_str}"
+        line = f"{time.strftime('%Y-%m-%d %H:%M:%S')},{int(time.time() * 1000) % 1000:03d} {self.sep} {step_full} {self.sep} {elapsed:.2f} {self.sep} {mem_gb:.2f} {self.sep} {self.user} {self.sep} {self.host}\n"
+        with open(self.log_path, "a") as f:
+            f.write(line)
+        print(f"{step_full} | {elapsed:.2f} | {mem_gb:.2f} | {self.user} | {self.host}")
+
+    @staticmethod
+    def log_to_dataframe(log_paths: Union[str, List[str]], sep: str = "|") -> pd.DataFrame:
+        if isinstance(log_paths, str):
+            log_paths = [log_paths]
+
+        rows = []
+        for log_id, path in enumerate(log_paths):
+            try:
+                with open(path) as f:
+                    for row_id, line in enumerate(f):
+                        parts = [x.strip() for x in line.strip().split(sep)]
+                        if len(parts) < 6:  # expect 6 fields: timestamp|step|elapsed|rss|user|host
+                            continue
+                        timestamp, step, elapsed_str, rss_str, user, host = parts[:6]
+                        row = {
+                            "timestamp": timestamp,
+                            "step": step,
+                            "elapsed_sec": float(elapsed_str),
+                            "rss_gb": float(rss_str),
+                            "user": user,
+                            "host": host,
+                            "logfile": path,
+                            "rowID": row_id,
+                            "logID": log_id
+                        }
+
+                        if "[" in step and "]" in step:
+                            base, idx = step.split("[")
+                            row["step"] = base
+                            idx = idx.rstrip("]")
+                            for i, val in enumerate(idx.split(",")):
+                                if val.strip().isdigit():
+                                    row[f"index_{i}"] = int(val.strip())
+                        rows.append(row)
+            except FileNotFoundError:
+                continue
+
+        return pd.DataFrame(rows)
+
+    @staticmethod
+    def summarize_with_config(df: pd.DataFrame, config: Dict) -> pd.DataFrame:
+        group_cols = config.get("by", ["step"])
+        stats = config.get("stats", ["mean", "max", "min"])
+        agg = {}
+        for col in ["elapsed_sec", "rss_gb"]:
+            agg[col] = stats
+        return df.groupby(group_cols).agg(agg)
+
+    @staticmethod
+    def summarize_with_configs(df: pd.DataFrame, config_dict: Dict[str, Dict]) -> Dict[str, pd.DataFrame]:
+        summaries = {}
+        for name, config in config_dict.items():
+            summaries[name] = PerformanceLogger.summarize_with_config(df, config)
+        return summaries
+
+    @staticmethod
+    def plot(df: pd.DataFrame,
+             config_dict: Dict[str, Dict],
+             filter_expr: Optional[str] = None,
+             output_pdf: Optional[str] = None):
+
+        if filter_expr:
+            df = df.query(filter_expr)
+
+        if output_pdf:
+            from matplotlib.backends.backend_pdf import PdfPages
+            pdf = PdfPages(output_pdf)
+
+        for name, config in config_dict.items():
+            subdf = df.copy()
+            if "filter" in config:
+                subdf = subdf.query(config["filter"])
+
+            varX = config.get("varX", "timestamp")
+            varY = config.get("varY", "elapsed_sec")
+            aggregation = config.get("aggregation")
+            xlabel = config.get("xlabel", varX)
+            ylabel = config.get("ylabel", varY)
+
+            if aggregation:
+                if isinstance(aggregation, list):
+                    agg_df = subdf.groupby(varX)[varY].agg(aggregation)
+                    subdf = agg_df.reset_index()
+                else:
+                    subdf = subdf.groupby(varX)[varY].agg(aggregation).reset_index()
+
+            sort_column = config.get("sort")
+            if sort_column:
+                subdf = subdf.sort_values(sort_column)
+
+            plt.figure()
+
+            if aggregation and isinstance(aggregation, list):
+                for stat in aggregation:
+                    plt.plot(subdf[varX], subdf[stat], marker="o", label=stat)
+                plt.legend()
+            else:
+                y = subdf[varY]
+                kind = config.get("kind", "line")
+                if kind == "line":
+                    plt.plot(subdf[varX], y, marker="o")
+                elif kind == "bar":
+                    plt.bar(subdf[varX], y)
+                else:
+                    raise ValueError(f"Unsupported plot kind: {kind}")
+
+            if "xticklabels" in config:
+                plt.xticks(ticks=subdf[varX], labels=subdf[config["xticklabels"]], rotation=45)
+
+            plt.title(config.get("title", name))
+            plt.xlabel(xlabel)
+            plt.ylabel(ylabel)
+            plt.tight_layout()
+            is_testing = "pytest" in sys.modules
+            if output_pdf:
+                pdf.savefig()
+                plt.close()
+            elif not is_testing:
+                plt.show()
+
+        if output_pdf:
+            pdf.close()
+
+
+# Default configurations
+
+default_plot_config={
+    "RSS vs Time": {
+        "kind": "line",
+        "varX": "timestamp",
+        "varY": "rss_gb",
+        "title": "RSS over Time",
+        "sort": "timestamp"
+    },
+    "RSS vs Step (chronological)": {
+        "kind": "line",
+        "varX": "rowID",
+        "varY": "rss_gb",
+        "title": "RSS vs Step",
+        "xlabel": "step",
+        "xticklabels": "step",
+        "sort": "rowID"
+    },
+    "Elapsed Time vs Step": {
+        "kind": "bar",
+        "varX": "step",
+        "varY": "elapsed_sec",
+        "title": "Elapsed Time per Step",
+        "sort": None
+    },
+    "RSS Summary Stats": {
+        "varX": "step",
+        "varY": "rss_gb",
+        "aggregation": ["mean", "median", "std"],
+        "title": "RSS Summary Statistics",
+        "xlabel": "Step",
+        "ylabel": "RSS (GB)",
+        "sort": "step"
+    },
+    "Elapsed Time Summary Stats": {
+        "varX": "step",
+        "varY": "elapsed_sec",
+        "aggregation": ["mean", "median", "std"],
+        "title": "Elapsed Time Summary Statistics",
+        "xlabel": "Step",
+        "ylabel": "Elapsed Time (s)",
+        "sort": "step"
+    },
+}
+
+default_summary_config={
+    "summary_by_step": {
+        "by": ["step"],
+        "stats": ["mean", "max", "min", "count"]
+    },
+    "summary_by_step_and_index": {
+        "by": ["step", "index_0"],
+        "stats": ["mean", "max", "min", "count"]
+    }
+}
\ No newline at end of file
diff --git a/UTILS/perfmonitor/test_performance_logger.py b/UTILS/perfmonitor/test_performance_logger.py
new file mode 100644
index 000000000..cd75c30fb
--- /dev/null
+++ b/UTILS/perfmonitor/test_performance_logger.py
@@ -0,0 +1,93 @@
+import time
+import tempfile
+import os
+import pytest
+import pandas as pd
+from perfmonitor.performance_logger import (
+    PerformanceLogger,
+    default_summary_config,
+    default_plot_config,
+)
+
+def test_basic_logging_and_parsing():
+    with tempfile.NamedTemporaryFile(delete=False, mode='w+', suffix=".txt") as tmp:
+        log_path = tmp.name
+
+    logger = PerformanceLogger(log_path)
+    logger.log("start")
+    time.sleep(0.1)
+    logger.log("step::loop", index=[0])
+    time.sleep(0.1)
+    logger.log("step::loop", index=[1, 2])
+
+    df = PerformanceLogger.log_to_dataframe([log_path])
+    assert not df.empty
+    assert "step" in df.columns
+    assert "elapsed_sec" in df.columns
+    assert "rss_gb" in df.columns
+    assert df["step"].str.contains("step::loop").any()
+    assert "index_1" in df.columns  # tests index parsing
+
+    os.remove(log_path)
+
+
+def test_missing_log_file_handling():
+    df = PerformanceLogger.log_to_dataframe(["nonexistent_file.txt"])
+    assert isinstance(df, pd.DataFrame)
+    assert df.empty
+
+
+def test_plot_and_summary(tmp_path):
+    log_path = tmp_path / "log.txt"
+    logger = PerformanceLogger(log_path)
+    logger.log("init")
+    time.sleep(0.05)
+    for i in range(3):
+        logger.log("step::loop", index=[i])
+        time.sleep(0.01)
+
+    df = PerformanceLogger.log_to_dataframe([str(log_path)])
+
+    # summarize_with_configs takes a dict of configs and returns a dict of DataFrames
+    summary = PerformanceLogger.summarize_with_configs(df, default_summary_config)
+    assert isinstance(summary, dict)
+    assert "summary_by_step" in summary
+
+    # Test plotting (non-crashing)
+    PerformanceLogger.plot(df, default_plot_config)
+
+
+def test_multiple_files():
+    paths = []
+    for i in range(2):
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
+            path = tmp.name
+        logger = PerformanceLogger(path)
+        logger.log(f"file{i}::start")
+        paths.append(path)
+
+    df = PerformanceLogger.log_to_dataframe(paths)
+    assert len(df) == 2
+    assert "logfile" in df.columns
+    for path in paths:
+        os.remove(path)
+
+
+def test_custom_summary():
+    with tempfile.NamedTemporaryFile(delete=False) as tmp:
+        log_path = tmp.name
+
+    logger = PerformanceLogger(log_path)
+    for i in range(3):
+        logger.log("step::measure", index=[i])
+        time.sleep(0.01)
+
+    df = PerformanceLogger.log_to_dataframe([log_path])
+    config = {
+        "by_index": {
+            "by": ["index_0"],
+            "stats": ["mean", "count"]
+        }
+    }
+    summary = PerformanceLogger.summarize_with_configs(df, config)
+    assert "by_index" in summary
+    os.remove(log_path)
diff --git a/UTILS/setup.py b/UTILS/setup.py
new file mode 100644
index 000000000..58613d642
--- /dev/null
+++ b/UTILS/setup.py
@@ -0,0 +1,10 @@
+# File: /Users/miranov25/alicesw/O2DPG/UTILS/setup.py
+from setuptools import setup, find_packages
+
+setup(
+    name="utils",
+    version="0.1",
+    packages=find_packages(),  # This will include perfmonitor and others
+)
+
+
diff --git a/UTILS/utils.egg-info/PKG-INFO b/UTILS/utils.egg-info/PKG-INFO
new file mode 100644
index 000000000..01c864fc2
--- /dev/null
+++ b/UTILS/utils.egg-info/PKG-INFO
@@ -0,0 +1,9 @@
+Metadata-Version:
2.1 +Name: utils +Version: 0.1 +Summary: UNKNOWN +License: UNKNOWN +Platform: UNKNOWN + +UNKNOWN + diff --git a/UTILS/utils.egg-info/SOURCES.txt b/UTILS/utils.egg-info/SOURCES.txt new file mode 100644 index 000000000..4d669ee72 --- /dev/null +++ b/UTILS/utils.egg-info/SOURCES.txt @@ -0,0 +1,15 @@ +README.md +setup.py +dfextensions/AliasDataFrame.py +dfextensions/AliasDataFrameTest.py +dfextensions/DataFrameUtils.py +dfextensions/FormulaLinearModel.py +dfextensions/__init__.py +dfextensions/groupby_regression.py +dfextensions/test_groupby_regression.py +perfmonitor/__init__.py +perfmonitor/performance_logger.py +utils.egg-info/PKG-INFO +utils.egg-info/SOURCES.txt +utils.egg-info/dependency_links.txt +utils.egg-info/top_level.txt \ No newline at end of file diff --git a/UTILS/utils.egg-info/dependency_links.txt b/UTILS/utils.egg-info/dependency_links.txt new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/UTILS/utils.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/UTILS/utils.egg-info/top_level.txt b/UTILS/utils.egg-info/top_level.txt new file mode 100644 index 000000000..dcd35a7ea --- /dev/null +++ b/UTILS/utils.egg-info/top_level.txt @@ -0,0 +1,2 @@ +dfextensions +perfmonitor