-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Labels
Description
def test_col_distribution_mask(self):
col_probs = [.5, .3, .2]
cols = ['var1', 'var2', 'label']
rct = BaseRandomCellTransform([.0, 1.], cols, col_probs)
data = self.df.sample(10000, replace=True)
mask = rct._make_mask(data)
assert type(mask) == np.ndarray
assert mask.shape == (data.shape[0], len(cols))
# check if it is a proper one-hot encoding
assert mask.sum() == data.shape[0]
expected_counts = [5250, 3050, 1700]
threshold = .0001
# the counts do not ideally match the expected 5000, 3000, 2000
c, p = chisquare(mask.sum(0), expected_counts)
if p <= threshold:
print(f'Error. looks like the column distribution {mask.sum(0)} is too far from expected '
f'{expected_counts}')
> assert p > threshold
E assert 1.8048484428487904e-06 > 0.0001
The printed message was:
Error. looks like the column distribution [5506. 2872. 1622.] is too far from expected [5250, 3050, 1700]