Using mother in your ML project
Here, I want to give an overview on how mother can be used in a project scope. As mentioned earlier, multiple examples can be found in the examples folder. The beauty of the current mother implementation is that every step can be performed individually (in case you have the appropriate input for the step). To achieve that, the code of each step is put into separate submodules. Examples for the most important submodules will be given below. A comprehensive overview can be found in the examples.
To facilitate configuration of the ML project, a MotherSettings class is created. In the following, usage of that settings class (which is based on a simple YAML file) is shown. Project-specific configuration files can be loaded as well. The example below shows the Python code in the 'Source' tab and the result (print) in the 'Result' tab.
Example
import typing

from mother import ml
from mother import pipeline_utils as mother_takes_care
from mother.settings import MotherSettings
from sklearn import set_config
from sklearn.pipeline import FeatureUnion, Pipeline
import sklearn.model_selection as skl_model_sel

# Load the project configuration (defaults from the packaged YAML file).
mother_settings: MotherSettings = MotherSettings.create()

# Assemble all pipeline steps up front as a list literal instead of repeated
# append() calls.
# BUG FIX: the annotation must be a concrete, mutable list type --
# typing.Sequence declares a read-only interface with no append(), so the
# original annotation was rejected by type checkers.
model_steps: typing.List[
    typing.Tuple[str, typing.Union[Pipeline, FeatureUnion, ml.AbstractMotherPipeline, typing.Any]]
] = [
    ("preprocessor", mother_takes_care.get_preprocessing_pipeline(settings=mother_settings)),
    ("feature_generator", mother_takes_care.get_feature_generation_pipeline(settings=mother_settings)),
    (
        "feature_selector",
        mother_takes_care.get_feature_selection_pipeline(
            settings=mother_settings, cv=skl_model_sel.GroupKFold(n_splits=5)
        ),
    ),
    ("model", mother_takes_care.get_model(settings=mother_settings)),
]

# Full training pipeline; all transformers emit pandas DataFrames.
training_pipeline: Pipeline = Pipeline(steps=model_steps)
training_pipeline.set_output(transform="pandas")
print(training_pipeline)
# training_pipeline.fit(X_train, y_train)
# inference_pipeline = training_pipeline
# inference_pipeline.predict(["CCC"])
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('smiles_standardizer',
StandardizerTransformer(flags=['STANDARDIZE',
'NEUTRALIZE',
'DESALT'])),
('smiles_to_mol', SmilesToMolTransformer())])),
('feature_generator',
FeatureUnionWithHyperparameterRooting(transformer_list=[('Maccs',
MaccsFingerprints()),
('Desc',
ChemicalDescriptors(descriptor_prefix='rdkit_',
o...
DropDuplicateFeatures()),
('constant_selector',
DropConstantFeatures(missing_values='ignore')),
('correlation_selector',
SmartCorrelatedSelection(selection_method='variance',
threshold=0.9))]),
{ "input": { "file": "examples/notebooks/freesolv_train.csv", "separator": ",", "structure_col": "smiles", "target_columns": [ "expt" ], "group_col": "cat_col" }, "pipeline": { "memory": null, "verbose": false, "transform": "pandas", "n_jobs": null, "remainder": "passthrough", "verbose_feature_names_out": false }, "preprocessing": { "flags": [ "STANDARDIZE", "NEUTRALIZE", "FLATTEN_STEREOCHEMISTRY" ] }, "feature_generation": { "fingerprints": [ { "MorganFP": { "radius": 2, "fpSize": 1024, "includeChirality": false } } ], "maccs": true, "chemical_descriptors": { "omit_prefixes": [ "fr_", "FpDensity" ], "descriptor_prefix": "rdkit_", "descriptor_list": null }, "use_counts": false }, "cv": { "cv_type": "tanimoto_grouping", "n_splits": 5, "parameters": { "similarity_threshold": 0.4 } }, "model": { "categorical_features": [ "cat_col" ], "model_type": "regression", "target_type": "single_target", "feature_selection_type": "catboost", "correlation_threshold": 0.9, "feature_selection_flags": [ "DROP_CORRELATED", "DROP_CONSTANT", "DROP_DUPLICATES" ], "algorithm": "catboost", "parameters": { "iterations": null, "learning_rate": null, "max_depth": 6, "boosting_type": "Plain", "loss_function": "RMSE", "nan_mode": "Forbidden", "thread_count": 2, "l2_leaf_reg": null, "grow_policy": null, "model_size_reg": null, "rsm": null, "border_count": null, "feature_border_type": null, "per_float_feature_quantization": null, "input_borders": null, "output_borders": null, "fold_permutation_block": null, "od_pval": null, "od_wait": null, "od_type": null, "counter_calc_method": null, "leaf_estimation_iterations": null, "leaf_estimation_method": null, "random_seed": null, "use_best_model": null, "best_model_min_trees": null, "verbose": null, "silent": null, "logging_level": null, "metric_period": null, "ctr_leaf_count_limit": null, "store_all_simple_ctr": null, "max_ctr_complexity": null, "has_time": null, "allow_const_label": null, "target_border": null, "one_hot_max_size": null, 
"random_strength": null, "random_score_type": null, "name": null, "ignored_features": null, "train_dir": null, "custom_metric": null, "eval_metric": null, "bagging_temperature": null, "save_snapshot": null, "snapshot_file": null, "snapshot_interval": null, "fold_len_multiplier": null, "used_ram_limit": null, "gpu_ram_part": null, "pinned_memory_size": null, "allow_writing_files": null, "final_ctr_computation_mode": null, "approx_on_full_history": null, "simple_ctr": null, "combinations_ctr": null, "per_feature_ctr": null, "ctr_description": null, "ctr_target_border_count": null, "task_type": null, "device_config": null, "devices": null, "bootstrap_type": null, "subsample": null, "mvs_reg": null, "sampling_frequency": null, "sampling_unit": null, "dev_score_calc_obj_block_size": null, "dev_efb_max_buckets": null, "sparse_features_conflict_fraction": null, "n_estimators": null, "num_boost_round": null, "num_trees": null, "colsample_bylevel": null, "random_state": null, "reg_lambda": null, "objective": null, "eta": null, "max_bin": null, "gpu_cat_features_storage": null, "data_partition": null, "metadata": null, "early_stopping_rounds": null, "cat_features": null, "min_data_in_leaf": null, "min_child_samples": null, "max_leaves": null, "num_leaves": null, "score_function": null, "leaf_estimation_backtracking": null, "ctr_history_unit": null, "monotone_constraints": null, "feature_weights": null, "penalties_coefficient": null, "first_feature_use_penalties": null, "per_object_feature_penalties": null, "model_shrink_rate": null, "model_shrink_mode": null, "langevin": null, "diffusion_temperature": null, "posterior_sampling": null, "boost_from_average": null, "text_features": null, "tokenizers": null, "dictionaries": null, "feature_calcers": null, "text_processing": null, "embedding_features": null, "eval_fraction": null, "fixed_binary_splits": null }, "feature_selection_threshold": 0.0, "feature_selection_max_features": null }, "tuning": { "scorer": "r2", "direction": 
"maximize", "early_stopping_optuna": false, "n_trials_optuna": 2, "n_threads_optuna": 2, "n_startup_trials": 1, "seed": 42 } }
from mother.preprocessing import PreprocessingConfig, SmilesToMolTransformer, StandardizerTransformer
from mother.settings import MotherSettings
from sklearn import pipeline as sklearn_pipeline

# Project configuration (defaults from the packaged YAML file).
my_settings: MotherSettings = MotherSettings.create()

# Name the two preprocessing steps up front instead of building them inline.
standardizer_step = (
    "smiles_standardizer",
    StandardizerTransformer(**my_settings.preprocessing.model_dump()),
)
mol_conversion_step = ("smiles_to_mol", SmilesToMolTransformer())

# Standardize SMILES first, then convert them to molecule objects.
# Add other column transformations here if needed.
preprocessor: sklearn_pipeline.Pipeline = sklearn_pipeline.Pipeline(
    [standardizer_step, mol_conversion_step],
    memory=None,
)
preprocessor.set_output(transform="pandas")
print(preprocessor)
Pipeline(steps=[('smiles_standardizer', StandardizerTransformer(flags=['STANDARDIZE', 'NEUTRALIZE', 'DESALT'])), ('smiles_to_mol', SmilesToMolTransformer())])
from mother.feature_generation import ChemicalDescriptors, MaccsFingerprints, MorganFingerprints
from mother.settings import MotherSettings
from sklearn import pipeline as sklearn_pipeline

# Project configuration (defaults from the packaged YAML file).
my_settings: MotherSettings = MotherSettings.create()

# Combine several molecular feature generators in parallel.
feature_generator = sklearn_pipeline.FeatureUnion(
    transformer_list=[
        ("maccs", MaccsFingerprints()),
        # BUG FIX: the settings entry is a dict of keyword arguments
        # (radius, fpSize, includeChirality) and must be unpacked with **.
        # Passing it positionally binds the whole dict to the first
        # parameter, yielding radius={'fpSize': 1024, ...}.
        ("morgan", MorganFingerprints(**my_settings.feature_generation.fingerprints[0]["MorganFP"])),
        ("desc", ChemicalDescriptors(**my_settings.feature_generation.chemical_descriptors.model_dump())),
    ],
).set_output(transform="pandas")
print(feature_generator)
FeatureUnion(transformer_list=[('maccs', MaccsFingerprints()), ('morgan', MorganFingerprints(radius={'fpSize': 1024, 'includeChirality': False, 'radius': 2})), ('desc', ChemicalDescriptors(descriptor_prefix='rdkit_', omit_prefixes=('fr_', 'FpDensity')))])
from sklearn import pipeline as sklearn_pipeline
import sklearn.model_selection as skl_model_sel

import mother.pipeline_utils as mother_takes_care
from mother import ml
from mother.settings import MotherSettings

# Project configuration (defaults from the packaged YAML file).
my_settings: MotherSettings = MotherSettings.create()

# BUG FIX: feature selection feeds into the model sequentially, so the two
# belong in a Pipeline. A FeatureUnion applies its members in parallel and
# requires transform() on each of them -- a CatBoost estimator with only
# fit/predict cannot be fitted inside one.
model = sklearn_pipeline.Pipeline(steps=[
    (
        "feature_selector",
        mother_takes_care.get_feature_selection_pipeline(
            settings=my_settings,
            cv=skl_model_sel.GroupKFold(n_splits=5),
        ).set_output(transform="pandas"),
    ),
    ("ml_model", ml.CatboostRegressorMother(target_type=my_settings.model.target_type, logging_level="Silent")),
])
print(model)
FeatureUnion(transformer_list=[('feature_selector',
ColumnTransformerWithHyperparameterRooting(remainder='passthrough',
transformers=[('feature_selector',
PipelineWithHyperparameterRooting(steps=[('duplicate_selector',
DropDuplicateFeatures()),
('constant_selector',
DropConstantFeatures(missing_values='ignore')),
('correlation_selector',
SmartCorrelatedSelection(selection_method='variance',
threshold=0.9))]),
MotherSettings class
The mother settings class can be used as a convenience to store all project-relevant settings in one place. To jump-start your project, a yaml file containing some defaults (examples) is provided. The MotherSettings class is based on pydantic models. This design choice was made to validate the input directly, avoiding a lot of if/else cases in the source code to handle user errors caused by wrong configuration. Thus, using the MotherSettings class is highly recommended and avoids a lot of configuration issues. Every transformer, as well as a complete training pipeline, can be created with this class.
Bases: BaseSettings
Source code in mother/settings.py
| Python | |
|---|---|
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | |
create(file_path=None)
classmethod
Create a MotherSettings instance with default values and, if a path is given, write the configuration to that YAML file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
file_path
|
Optional[Union[Path, str]]
|
The path to the YAML file. Defaults to None. |
None
|
Returns:
| Name | Type | Description |
|---|---|---|
MotherSettings |
MotherSettings
|
An instance of MotherSettings with defaults. |
Logs
Logs the creation of the YAML file if file_path is provided.
Source code in mother/settings.py
dump_to_yaml(file_path)
Dumps the current model to a YAML file at the specified path.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
file_path
|
Path
|
The file path where the YAML content will be saved. |
required |
Raises:
| Type | Description |
|---|---|
OSError
|
If the file cannot be created or written to. |
Notes
- If the parent directory of the specified path does not exist, it will be created.
- If a file already exists at the specified path, it will be overwritten.
Source code in mother/settings.py
load_from_yaml(file_path)
classmethod
Load MotherSettings from a YAML file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
file_path
|
Union[Path, str]
|
The path to the YAML configuration file. Defaults to config_file. |
required |
Returns:
| Name | Type | Description |
|---|---|---|
MotherSettings |
MotherSettings
|
An instance of MotherSettings populated with data from the YAML file. |
Raises:
| Type | Description |
|---|---|
ValueError
|
If the provided file does not exist. |
Source code in mother/settings.py
To dump the default configuration as yaml file and to be able to modify it to your needs you can use the provided function described above. In the same way, the created yaml file can be loaded again (see above).
Configuration Subsections
In the following every highlighted subsection represents the settings for a different transformer. A specialty is the model section which contains also the information for feature selection which can be used to create a more complex feature selection pipeline. For example, pipeline contains all the required parameters to configure a scikit learn pipeline. Thus, parameters can be set once and reused at different stages of your pipeline.
# Default mother configuration. Each top-level section configures one
# transformer / stage of the pipeline (see "Configuration Subsections").
input:
  file: examples/notebooks/freesolv_train.csv
  separator: ","
  structure_col: "smiles"
  target_columns: ["expt"]
  group_col: "cat_col" # e.g.: "validation_date" for time series column
# Shared scikit-learn pipeline parameters, reused at several stages.
pipeline:
  memory: ~
  transform: "pandas"
  remainder: "passthrough"
  verbose: False
  n_jobs: ~
  verbose_feature_names_out: False
# Structure standardization flags applied before feature generation.
preprocessing:
  flags:
    - "STANDARDIZE"
    - "NEUTRALIZE"
    - "DESALT"
feature_generation:
  maccs: True
  fingerprints:
    - MorganFP:
        radius: 2
        fpSize: 1024
        includeChirality: False
  chemical_descriptors:
    descriptor_prefix: "rdkit_"
    omit_prefixes: ["fr_", "FpDensity"]
    descriptor_list: ~
# Cross-validation strategy used for feature selection / tuning.
cv:
  cv_type: "tanimoto_grouping"
  parameters:
    similarity_threshold: 0.4
# Optuna hyperparameter tuning settings.
tuning:
  scorer: "r2"
  early_stopping_optuna: False
  n_trials_optuna: 2
  n_threads_optuna: 2
  n_startup_trials: 1
  seed: 42
# Model section; also carries the feature-selection configuration.
model:
  categorical_features: ["cat_col"]
  model_type: "regression"
  target_type: "single_target"
  feature_selection_type: "catboost"
  algorithm: "catboost" # lasso or random_forest, xgboost, lightgbm, etc.
  feature_selection_flags: ["DROP_CORRELATED", "DROP_CONSTANT", "DROP_DUPLICATES"]
  feature_selection_threshold: 0
  correlation_threshold: 0.9
  # CatBoost constructor parameters; '~' (null) keeps the CatBoost default.
  parameters:
    iterations: ~
    learning_rate: ~
    max_depth: 6
    boosting_type: "Plain"
    loss_function: 'RMSE'
    nan_mode: "Forbidden"
    thread_count: 2
    l2_leaf_reg: ~
    grow_policy: ~
    model_size_reg: ~
    rsm: ~
    border_count: ~
    feature_border_type: ~
    per_float_feature_quantization: ~
    input_borders: ~
    output_borders: ~
    fold_permutation_block: ~
    od_pval: ~
    od_wait: ~
    od_type: ~
    counter_calc_method: ~
    leaf_estimation_iterations: ~
    leaf_estimation_method: ~
    random_seed: ~
    use_best_model: ~
    best_model_min_trees: ~
    verbose: ~
    silent: ~
    logging_level: ~
    metric_period: ~
    ctr_leaf_count_limit: ~
    store_all_simple_ctr: ~
    max_ctr_complexity: ~
    has_time: ~
    allow_const_label: ~
    target_border: ~
    one_hot_max_size: ~
    random_strength: ~
    random_score_type: ~
    name: ~
    ignored_features: ~
    train_dir: ~
    custom_metric: ~
    eval_metric: ~
    bagging_temperature: ~
    save_snapshot: ~
    snapshot_file: ~
    snapshot_interval: ~
    fold_len_multiplier: ~
    used_ram_limit: ~
    gpu_ram_part: ~
    pinned_memory_size: ~
    allow_writing_files: ~
    final_ctr_computation_mode: ~
    approx_on_full_history: ~
    simple_ctr: ~
    combinations_ctr: ~
    per_feature_ctr: ~
    ctr_description: ~
    ctr_target_border_count: ~
    task_type: ~
    device_config: ~
    devices: ~
    bootstrap_type: ~
    subsample: ~
    mvs_reg: ~
    sampling_frequency: ~
    sampling_unit: ~
    dev_score_calc_obj_block_size: ~
    dev_efb_max_buckets: ~
    sparse_features_conflict_fraction: ~
    n_estimators: ~
    num_boost_round: ~
    num_trees: ~
    colsample_bylevel: ~
    random_state: ~
    reg_lambda: ~
    objective: ~
    eta: ~
    max_bin: ~
    gpu_cat_features_storage: ~
    data_partition: ~
    metadata: ~
    early_stopping_rounds: ~
    cat_features: ~
    min_data_in_leaf: ~
    min_child_samples: ~
    max_leaves: ~
    num_leaves: ~
    score_function: ~
    leaf_estimation_backtracking: ~
    ctr_history_unit: ~
    monotone_constraints: ~
    feature_weights: ~
    penalties_coefficient: ~
    first_feature_use_penalties: ~
    per_object_feature_penalties: ~
    model_shrink_rate: ~
    model_shrink_mode: ~
    langevin: ~
    diffusion_temperature: ~
    posterior_sampling: ~
    boost_from_average: ~
    text_features: ~
    tokenizers: ~
    dictionaries: ~
    feature_calcers: ~
    text_processing: ~
    embedding_features: ~
    eval_fraction: ~
    # FIX: removed the stray space before the colon that was inconsistent
    # with every other key in this file.
    fixed_binary_splits: ~