114 changes: 93 additions & 21 deletions src/anonymeter/evaluators/inference_evaluator.py
@@ -14,16 +14,16 @@


def _run_attack(
target: pd.DataFrame,
syn: pd.DataFrame,
n_attacks: int,
aux_cols: list[str],
secret: str,
n_jobs: int,
naive: bool,
regression: Optional[bool],
inference_model: Optional[InferencePredictor],
) -> int:
target: pd.DataFrame,
syn: pd.DataFrame,
n_attacks: int,
aux_cols: list[str],
secret: str,
n_jobs: int,
naive: bool,
regression: Optional[bool],
inference_model: Optional[InferencePredictor],
) -> tuple[int, pd.Series]:
if regression is None:
regression = pd.api.types.is_numeric_dtype(target[secret])

@@ -35,9 +35,11 @@ def _run_attack(
# Instantiate the default KNN model if no other model is passed through `inference_model`.
if inference_model is None:
inference_model = KNNInferencePredictor(data=syn, columns=aux_cols, target_col=secret, n_jobs=n_jobs)

guesses = inference_model.predict(targets)
guesses = guesses.reindex_like(targets)

return evaluate_inference_guesses(guesses=guesses, secrets=targets[secret], regression=regression).sum()
return evaluate_inference_guesses(guesses=guesses, secrets=targets[secret], regression=regression).sum(), guesses


def evaluate_inference_guesses(
@@ -72,6 +74,9 @@ def evaluate_inference_guesses(
Array of boolean values indicating the correctness of each guess.

"""
if not guesses.index.equals(secrets.index):
raise RuntimeError("The prediction indices do not match the target indices. Check your inference model.")

guesses_np = guesses.to_numpy()
secrets_np = secrets.to_numpy()

@@ -152,7 +157,7 @@ def __init__(
syn: pd.DataFrame,
aux_cols: list[str],
secret: str,
regression: Optional[bool] = None,
regression: bool = False,
Review comment (Member):
💯

n_attacks: int = 500,
control: Optional[pd.DataFrame] = None,
inference_model: Optional[InferencePredictor] = None
@@ -180,7 +185,7 @@ def __init__(
self._aux_cols = aux_cols
self._evaluated = False

def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int, n_attacks: int) -> int:
def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int, n_attacks: int) -> tuple[int, pd.Series]:
return _run_attack(
target=target,
syn=self._syn,
@@ -207,14 +212,17 @@ def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator":
The evaluated ``InferenceEvaluator`` object.

"""
self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs,
n_attacks=self._n_attacks_baseline)
self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs,
n_attacks=self._n_attacks_ori)
self._n_control = (
None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs,
n_attacks=self._n_attacks_control)
)
self._n_baseline, self._guesses_baseline = self._attack(
target=self._ori, naive=True, n_jobs=n_jobs, n_attacks=self._n_attacks_baseline
)
self._n_success, self._guesses_success = self._attack(
target=self._ori, naive=False, n_jobs=n_jobs, n_attacks=self._n_attacks_ori
)
self._n_control, self._guesses_control = (
(None, None)
if self._control is None
else self._attack(target=self._control, naive=False, n_jobs=n_jobs, n_attacks=self._n_attacks_control)
)

self._evaluated = True
return self
@@ -269,3 +277,67 @@ def risk(self, confidence_level: float = 0.95, baseline: bool = False) -> PrivacyRisk:
"""
results = self.results(confidence_level=confidence_level)
return results.risk(baseline=baseline)

def risk_for_groups(self, confidence_level: float = 0.95) -> dict[str, EvaluationResults]:
"""Compute the inference risk for each group of targets with the same value of the secret attribute.

Parameters
----------
confidence_level : float, default is 0.95
Confidence level for the error bound calculation.

Returns
-------
dict[str, EvaluationResults]
A dictionary with each group (i.e. each value of the secret attribute) as key and the
corresponding EvaluationResults as value. The PrivacyRisk of a group can be obtained
by calling risk() on its EvaluationResults.
Review comment on lines +291 to +293 (Member):
Why return both the PrivacyRisk and the EvaluationResults? It would be better to just pick one of the two:

  • returning dict[str, EvaluationResults] is the more flexible choice, since the PrivacyRisk can be computed later.
  • returning dict[str, PrivacyRisk] is instead simpler and more high-level. Will the user need the EvaluationResults for anything other than computing the risk?

Reply (Member):
Otherwise, you can mirror the non-grouped case and have two separate methods: one called grouped_results (which returns dict[str, EvaluationResults]) and the other called grouped_risk, returning dict[str, PrivacyRisk]. This is probably the more complete option.
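A rough sketch of how that split could look as methods on InferenceEvaluator (hypothetical, not part of this PR; it assumes EvaluationResults.risk() defaults to the non-baseline risk):

```python
# Hypothetical sketch of the suggested two-method API (not in this PR).
def grouped_results(self, confidence_level: float = 0.95) -> dict[str, EvaluationResults]:
    """Per-group EvaluationResults, one entry per value of the secret attribute."""
    return self.risk_for_groups(confidence_level=confidence_level)

def grouped_risk(self, confidence_level: float = 0.95) -> dict[str, PrivacyRisk]:
    """Per-group PrivacyRisk derived from the per-group EvaluationResults."""
    results = self.grouped_results(confidence_level=confidence_level)
    # risk() is assumed to default to the non-baseline risk estimate.
    return {group: res.risk() for group, res in results.items()}
```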

Reply (Contributor Author):
You are right. I removed the risk for now. In our use case, it was almost always relevant to know the attack success rate, so I'm returning the EvaluationResults; if necessary, one can call risk() afterwards.
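For example, the grouped results could then be consumed like this (a usage sketch; `evaluator` stands for an already evaluated InferenceEvaluator, and risk() is assumed to default to the non-baseline risk):

```python
# Per-group attack results; the PrivacyRisk can still be derived per group if needed.
results_by_group = evaluator.risk_for_groups(confidence_level=0.95)
for group, results in results_by_group.items():
    print(group, results.attack_rate, results.risk())
```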


"""
if not self._evaluated:
raise RuntimeError("The inference evaluator has not been evaluated yet. Please run `evaluate()` first.")

all_results = {}

# For every unique value of the secret column `self._secret` in the original data
for group, data_ori in self._ori.groupby(self._secret):
# Restrict to the records of this group that were actually attacked
common_indices = data_ori.index.intersection(self._guesses_success.index)
# Get the targets for the current group
target_group = data_ori.loc[common_indices]
n_attacks_ori = len(target_group)

# Count the number of successful attacks
n_success = evaluate_inference_guesses(
Review comment (Member):
Re the other comment: here you are not checking for consistent indexes. That would come for free if the check is moved into evaluate_inference_guesses.

guesses=self._guesses_success.loc[common_indices],
secrets=target_group[self._secret],
regression=self._regression,
).sum()

if self._control is not None:
# Get the targets for the current control group
data_control = self._control[self._control[self._secret] == group]

# Get the guesses for the current control group
common_indices = data_control.index.intersection(self._guesses_control.index)
n_attacks_control = len(common_indices)

# Count the number of successful attacks on the control group
n_control = evaluate_inference_guesses(
guesses=self._guesses_control.loc[common_indices],
secrets=data_control.loc[common_indices, self._secret],
regression=self._regression,
).sum()
else:
n_control = None
n_attacks_control = -1

# Recreate the EvaluationResults for the current group
all_results[group] = EvaluationResults(
n_attacks=(n_attacks_ori, self._n_attacks_baseline, n_attacks_control),
n_success=n_success,
n_baseline=self._n_baseline, # The baseline count is the same regardless of the group
n_control=n_control,
confidence_level=confidence_level,
)

return all_results
7 changes: 5 additions & 2 deletions tests/fixtures.py
@@ -11,7 +11,7 @@
TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__))


def get_adult(which: str, n_samples: Optional[int] = None) -> pd.DataFrame:
def get_adult(which: str, deduplicate_on: Optional[list[str]] = None, n_samples: Optional[int] = None) -> pd.DataFrame:
"""Fixture for the adult dataset.

For details see:
@@ -21,6 +21,8 @@ def get_adult(which: str, n_samples: Optional[int] = None) -> pd.DataFrame:
----------
which : str, in ['ori', 'syn']
Whether to return the "original" or "synthetic" samples.
deduplicate_on : list of str, optional
Columns on which duplicate records are dropped from the samples.
If `None`, no deduplication is performed.
n_samples : int
Number of sample records to return.
If `None` - return all samples.
@@ -37,4 +39,5 @@ def get_adult(which: str, n_samples: Optional[int] = None) -> pd.DataFrame:
else:
raise ValueError(f"Invalid value {which} for parameter `which`. Available are: 'ori' or 'syn'.")

return pd.read_csv(os.path.join(TEST_DIR_PATH, "datasets", fname), nrows=n_samples)
samples = pd.read_csv(os.path.join(TEST_DIR_PATH, "datasets", fname), nrows=n_samples)
return samples.drop_duplicates(subset=deduplicate_on) if deduplicate_on else samples
45 changes: 42 additions & 3 deletions tests/test_inference_evaluator.py
@@ -29,6 +29,21 @@ def test_evaluate_inference_guesses_classification(guesses, secrets, expected):
np.testing.assert_equal(out, expected)


@pytest.mark.parametrize(
"guesses, secrets",
[
(("a", "b"), ("a", "b")),
((np.nan, "b"), (np.nan, "b")),
],
)
def test_evaluate_inference_guesses_secrets_indices(guesses, secrets):
secrets = pd.Series(secrets).sort_index(ascending=False)
with pytest.raises(Exception) as runtime_error:
evaluate_inference_guesses(guesses=pd.Series(guesses), secrets=secrets, regression=False)
assert runtime_error.type is RuntimeError
assert "The predictions indices do not match the target indices" in str(runtime_error.value)


@pytest.mark.parametrize(
"guesses, secrets, expected",
[
@@ -103,8 +118,10 @@ def test_inference_evaluator_rates(
)
@pytest.mark.parametrize("secret", ["education", "marital", "capital_gain"])
def test_inference_evaluator_leaks(aux_cols, secret):
ori = get_adult("ori", n_samples=10)
evaluator = InferenceEvaluator(ori=ori, syn=ori, control=ori, aux_cols=aux_cols, secret=secret, n_attacks=10)
ori = get_adult("ori", deduplicate_on=aux_cols, n_samples=10)
evaluator = InferenceEvaluator(
ori=ori, syn=ori, control=ori, aux_cols=aux_cols, secret=secret, n_attacks=ori.shape[0]
)
evaluator.evaluate(n_jobs=1)
results = evaluator.results(confidence_level=0)

@@ -113,7 +130,7 @@ def test_inference_evaluator_leaks(aux_cols, secret):


def test_evaluator_not_evaluated():
df = get_adult("ori", n_samples=10)
df = get_adult("ori", deduplicate_on=None, n_samples=10)
evaluator = InferenceEvaluator(
ori=df,
syn=df,
@@ -123,3 +140,25 @@ def test_evaluator_not_evaluated():
)
with pytest.raises(RuntimeError):
evaluator.risk()


@pytest.mark.parametrize(
"aux_cols",
[
["type_employer", "capital_loss", "hr_per_week", "age"],
["education_num", "marital", "capital_loss"],
],
)
@pytest.mark.parametrize("secret", ["education", "marital"])
def test_inference_evaluator_group_wise(aux_cols, secret):
ori = get_adult("ori", deduplicate_on=aux_cols, n_samples=10)
evaluator = InferenceEvaluator(
ori=ori, syn=ori, control=ori, aux_cols=aux_cols, secret=secret, n_attacks=ori.shape[0]
)
evaluator.evaluate(n_jobs=1)

group_wise = evaluator.risk_for_groups(confidence_level=0)

for _, results in group_wise.items():
np.testing.assert_equal(results.attack_rate, (1, 0))
np.testing.assert_equal(results.control_rate, (1, 0))
4 changes: 2 additions & 2 deletions tests/test_mixed_types_kneigbors.py
@@ -13,7 +13,7 @@


def test_mixed_type_kNN():
df = get_adult("ori", n_samples=10)
df = get_adult("ori", deduplicate_on=None, n_samples=10)
nn = MixedTypeKNeighbors().fit(df)
shuffled_idx = rng.integers(10, size=10)
dist, ids = nn.kneighbors(df.iloc[shuffled_idx], n_neighbors=1, return_distance=True)
@@ -43,7 +43,7 @@ def test_mixed_type_kNN_numerical_scaling():

@pytest.mark.parametrize("n_neighbors, n_queries", [(1, 10), (3, 5)])
def test_mixed_type_kNN_shape(n_neighbors, n_queries):
df = get_adult("ori", n_samples=10)
df = get_adult("ori", deduplicate_on=None, n_samples=10)
nn = MixedTypeKNeighbors(n_neighbors=n_neighbors).fit(df)
ids = nn.kneighbors(df.head(n_queries))
assert isinstance(ids, np.ndarray)
8 changes: 4 additions & 4 deletions tests/test_singling_out_evaluator.py
@@ -23,8 +23,8 @@

@pytest.mark.parametrize("mode", ["univariate", "multivariate"])
def test_so_general(mode: str) -> None:
ori = get_adult("ori", n_samples=10)
syn = get_adult("syn", n_samples=10)
ori = get_adult("ori", deduplicate_on=None, n_samples=10)
syn = get_adult("syn", deduplicate_on=None, n_samples=10)
soe = SinglingOutEvaluator(ori=ori, syn=syn, n_attacks=5).evaluate(mode=mode)

for q in soe.queries():
@@ -150,7 +150,7 @@ def test_singling_out_query_generator() -> None:
@pytest.mark.parametrize("confidence_level", [0.5, 0.68, 0.95, 0.99])
@pytest.mark.parametrize("mode", ["univariate", "multivariate"])
def test_singling_out_risk_estimate(confidence_level: float, mode: str) -> None:
ori = get_adult("ori", 10)
ori = get_adult("ori", deduplicate_on=None, n_samples=10)
soe = SinglingOutEvaluator(ori=ori, syn=ori, n_attacks=5)
soe.evaluate(mode=mode)
_, ci = soe.risk(confidence_level=confidence_level)
@@ -176,7 +176,7 @@ def _so_probability(n: int, w: float):

@pytest.mark.parametrize("max_attempts", [1, 2, 3])
def test_so_evaluator_max_attempts(max_attempts: int) -> None:
ori = get_adult("ori", 10)
ori = get_adult("ori", deduplicate_on=None, n_samples=10)
soe = SinglingOutEvaluator(ori=ori, syn=ori, n_attacks=10, max_attempts=max_attempts)
soe.evaluate(mode="multivariate")

2 changes: 1 addition & 1 deletion tests/test_sklearn_inference_model.py
@@ -25,7 +25,7 @@
)
@pytest.mark.parametrize("secret", ["capital_gain", "capital_loss"])
def test_inference_evaluator_custom_model_regressor(aux_cols, secret):
ori = get_adult("ori", n_samples=10)
ori = get_adult("ori", deduplicate_on=aux_cols, n_samples=10)

# Inference model prep
categorical_cols = ori[aux_cols].select_dtypes(include=["object"]).columns