Balance Calculators

[1]:
import numpy as np

from pybalance.utils.balance_calculators import *
from pybalance.utils import MatchingData
from pybalance.sim import load_paper_dataset
[2]:
# Load the paper dataset from pybalance.sim; the bare `m` on the last line
# makes the notebook display it as the cell output.
m = load_paper_dataset()
m
[2]:
Headers Numeric:
['age', 'height', 'weight']

Headers Categoric:
['gender', 'haircolor', 'country', 'binary_0', 'binary_1', 'binary_2', 'binary_3']

Populations
['pool', 'target']
age height weight gender haircolor country population binary_0 binary_1 binary_2 binary_3 patient_id
0 64.854093 189.466850 88.835049 1.0 1 4 pool 0 1 0 1 135740
1 52.571993 158.134940 94.215107 1.0 1 1 pool 0 1 0 1 49288
2 25.828361 154.692482 94.226222 1.0 0 3 pool 0 0 1 0 256676
3 70.177571 160.536632 94.244356 1.0 0 2 pool 0 0 0 1 338287
4 73.779164 153.551419 86.161814 0.0 0 1 pool 0 0 1 1 72849
... ... ... ... ... ... ... ... ... ... ... ... ...
274995 62.547794 186.005015 50.975051 0.0 0 1 target 0 0 1 1 579081
274996 69.879934 142.371386 100.138389 1.0 1 4 target 0 1 1 0 569939
274997 56.921402 130.639589 108.745182 1.0 1 5 target 0 1 0 0 532419
274998 34.082754 174.764051 67.998396 0.0 2 2 target 0 0 0 1 566266
274999 60.981259 137.419436 89.897817 1.0 0 5 target 1 1 1 1 544231

275000 rows × 12 columns

[13]:
# Show how many rows each population (pool / target) contains.
m.counts()
[13]:
N
population
pool 250000
target 25000

Fit Balance Calculator

[14]:
# Balance calculators in general are "fit" to the whole population data.
# Fitting here means fitting preprocessors (e.g. what bins to use when binning
# is involved). It's important to fit once so that all calls to distance()
# can be compared meaningfully.
beta = BetaBalance(m)
# Note the unpacking order: the target population comes first, then the pool.
target, pool = split_target_pool(m)

Balance between pool and target

[15]:
# Distance between the full pool and the target; the target argument is
# omitted here.
beta.distance(pool)
[15]:
tensor(0.2353, dtype=torch.float64)
[16]:
# Specifying the target is optional: passing it explicitly yields the same
# distance as the pool-only call.
beta.distance(pool, target)
[16]:
tensor(0.2353, dtype=torch.float64)

Balance between subset of pool and target

[17]:
# Compare 100 rows sampled from the pool against the target. No random_state is
# passed to sample(), so presumably the value varies from run to run.
beta.distance(pool.sample(n=100))
[17]:
tensor(0.2366, dtype=torch.float64)
[18]:
# Can also take subsets of the target: here both the pool and the target are
# reduced to 100 sampled rows before the distance is computed.
beta.distance(pool.sample(n=100), target.sample(n=100))
[18]:
tensor(0.2669, dtype=torch.float64)

Balance between several subsets simultaneously

[19]:
# Build two candidate subsets of 100 distinct pool rows each. Rows are identified
# by their position in the index produced by reset_index(), and the subsets are
# stacked into one 2-D array; distance() then returns one value per subset.
pool_positions = pool.reset_index().index.values
pool_subsets = np.array(
    [np.random.choice(pool_positions, size=100, replace=False) for _ in range(2)]
)
beta.distance(pool_subsets)
[19]:
tensor([0.2404, 0.2418], dtype=torch.float64)
[9]:
# Several pool subsets can be scored against several target subsets in one call;
# distance() returns one value per (pool subset, target subset) pair.
#
# Each group of subsets is stacked into a single 2-D array with np.array():
# handing distance() a plain list of separate arrays makes PyTorch warn that
# "Creating a tensor from a list of numpy.ndarrays is extremely slow".
pool_positions = pool.reset_index().index.values
target_positions = target.reset_index().index.values

pool_subsets = np.array(
    [np.random.choice(pool_positions, size=100, replace=False) for _ in range(2)]
)
target_subsets = np.array(
    [np.random.choice(target_positions, size=100, replace=False) for _ in range(2)]
)
beta.distance(pool_subsets, target_subsets)
/Users/gmema/src/pybalance/pybalance/utils/balance_calculators.py:224: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_new.cpp:278.)
  subset_populations = torch.tensor(
[9]:
tensor([0.2602, 0.2757], dtype=torch.float64)
[28]:
# Must have same number of subsets! This will throw an error:
# two pool subsets are paired with three target subsets, and the resulting
# ValueError is caught so that only its message is printed.
pool_positions = pool.reset_index().index.values
target_positions = target.reset_index().index.values

pool_subsets = [
    np.random.choice(pool_positions, size=100, replace=False) for _ in range(2)
]
target_subsets = [
    np.random.choice(target_positions, size=100, replace=False) for _ in range(3)
]

try:
    beta.distance(pool_subsets, target_subsets)
except ValueError as err:
    print(err)
Number of subset populations must be same for pool and target!

Basic Genetic Optimizer

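Here is a very basic, unoptimized illustration of genetic-style matching. It is not very smart: every generation draws fresh random candidate subsets and never mixes (recombines) the best candidates found so far, so it is effectively a random search. It is only meant to illustrate how the balance calculator can be used.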

[27]:
def get_subsets(pool, target, pool_size, target_size, n_subsets):
    """Draw random candidate subsets from the pool and from the target.

    Rows are identified by their position in the index obtained after
    ``reset_index()``. Every subset is drawn without replacement using
    NumPy's global random state; all pool subsets are drawn first, followed
    by all target subsets.

    Parameters
    ----------
    pool, target
        Objects exposing ``reset_index()`` (e.g. pandas DataFrames).
    pool_size, target_size : int
        Number of distinct rows in each pool / target subset.
    n_subsets : int
        Number of subsets to draw for each population.

    Returns
    -------
    tuple of (list, list)
        ``(pool_subsets, target_subsets)``; each list holds ``n_subsets``
        integer arrays of positions.
    """
    pool_positions = pool.reset_index().index.values
    target_positions = target.reset_index().index.values

    pool_subsets = []
    for _ in range(n_subsets):
        drawn = np.random.choice(pool_positions, size=pool_size, replace=False)
        pool_subsets.append(drawn)

    target_subsets = []
    for _ in range(n_subsets):
        drawn = np.random.choice(target_positions, size=target_size, replace=False)
        target_subsets.append(drawn)

    return pool_subsets, target_subsets


# Random-search driver: each generation draws fresh candidate subset pairs,
# scores all of them with a single distance() call, and keeps the best pair
# seen so far.

# get_subsets draws from NumPy's global random state; seed it so that re-running
# this cell reproduces the same search.
np.random.seed(0)

pool_size = 1000      # rows per candidate pool subset
target_size = 1000    # rows per candidate target subset
n_subsets = 100       # candidate pairs scored per generation
n_generations = 100   # number of rounds of fresh candidates
best_match = None
best_distance = float("inf")  # sentinel: any real distance is smaller
for j in range(n_generations):
    pool_subsets, target_subsets = get_subsets(pool, target, pool_size, target_size, n_subsets)
    distances = beta.distance(pool_subsets, target_subsets)
    this_best_distance = distances.min()
    if this_best_distance < best_distance:
        # New overall best: remember its distance and the pair that produced it.
        best_distance = this_best_distance
        best_match_idx = distances.argmin()
        best_match = pool_subsets[best_match_idx], target_subsets[best_match_idx]

    # Report progress every tenth generation.
    if j % 10 == 0:
        print(f'Generation {j} / Best distance found {best_distance:.3f}')
Generation 0 / Best distance found 0.215
Generation 10 / Best distance found 0.215
Generation 20 / Best distance found 0.211
Generation 30 / Best distance found 0.211
Generation 40 / Best distance found 0.208
Generation 50 / Best distance found 0.208
Generation 60 / Best distance found 0.208
Generation 70 / Best distance found 0.208
Generation 80 / Best distance found 0.208
Generation 90 / Best distance found 0.208
[ ]: