Spaces:

ml-jku
/

tox21_rf_classifier

Sleeping

App Files Files Community

antoniaebner committed on Nov 11

Commit

db0fcf9

1 Parent(s): 6770901

add config usage

Browse files

Files changed (4) hide show

config/config.json +95 -0
predict.py +16 -11
src/utils.py +14 -0
train.py +36 -140

config/config.json ADDED Viewed

	@@ -0,0 +1,95 @@

+{
+    "seed": 0,
+    "ecfp_radius": 3,
+    "ecfp_fpsize": 8192,
+    "model_path": "checkpoints/rf_alltasks.joblib",
+    "data_folder": "data_tox21/",
+    "log_folder": "logs/",
+    "debug": 1,
+    "task_configs": {
+        "NR-AR": {
+            "max_depth": "none",
+            "max_features": "sqrt",
+            "min_samples_leaf": 1,
+            "min_samples_split": 5,
+            "n_estimators": 1000
+        },
+        "NR-AR-LBD": {
+            "max_depth": 12,
+            "max_features": "sqrt",
+            "min_samples_leaf": 1,
+            "min_samples_split": 5,
+            "n_estimators": 1000
+        },
+        "NR-AhR": {
+            "max_depth": "none",
+            "max_features": "log2",
+            "min_samples_leaf": 1,
+            "min_samples_split": 2,
+            "n_estimators": 1000
+        },
+        "NR-Aromatase": {
+            "max_depth": "none",
+            "max_features": "sqrt",
+            "min_samples_leaf": 4,
+            "min_samples_split": 12,
+            "n_estimators": 1000
+        },
+        "NR-ER": {
+            "max_depth": 10,
+            "max_features": "sqrt",
+            "min_samples_leaf": 1,
+            "min_samples_split": 2,
+            "n_estimators": 1000
+        },
+        "NR-ER-LBD": {
+            "max_depth": 8,
+            "max_features": "sqrt",
+            "min_samples_leaf": 2,
+            "min_samples_split": 5,
+            "n_estimators": 1000
+        },
+        "NR-PPAR-gamma": {
+            "max_depth": "none",
+            "max_features": "log2",
+            "min_samples_leaf": 1,
+            "min_samples_split": 2,
+            "n_estimators": 1000
+        },
+        "SR-ARE": {
+            "max_depth": "none",
+            "max_features": "sqrt",
+            "min_samples_leaf": 1,
+            "min_samples_split": 5,
+            "n_estimators": 1000
+        },
+        "SR-ATAD5": {
+            "max_depth": "none",
+            "max_features": "sqrt",
+            "min_samples_leaf": 1,
+            "min_samples_split": 2,
+            "n_estimators": 1000
+        },
+        "SR-HSE": {
+            "max_depth": 16,
+            "max_features": "log2",
+            "min_samples_leaf": 1,
+            "min_samples_split": 2,
+            "n_estimators": 1000
+        },
+        "SR-MMP": {
+            "max_depth": "none",
+            "max_features": "sqrt",
+            "min_samples_leaf": 2,
+            "min_samples_split": 2,
+            "n_estimators": 1000
+        },
+        "SR-p53": {
+            "max_depth": "none",
+            "max_features": "sqrt",
+            "min_samples_leaf": 1,
+            "min_samples_split": 2,
+            "n_estimators": 1000
+        }
+    }
+}

predict.py CHANGED Viewed

@@ -8,17 +8,16 @@ SMILES and target names as keys.
 # Dependencies
 from collections import defaultdict
 import numpy as np
 from tqdm import tqdm
 from src.preprocess import create_descriptors
-from src.utils import TASKS
 from src.model import Tox21RFClassifier
 # ---------------------------------------------------------------------------------------
-ECFP_RADIUS = 3
-ECFP_FPSIZE = 8192
-DEBUG = False
 def predict(
@@ -35,8 +34,12 @@ def predict(
     """
     print(f"Received {len(smiles_list)} SMILES strings")
     features, is_clean = create_descriptors(
-        smiles_list, radius=ECFP_RADIUS, fpsize=ECFP_FPSIZE
     )
     n_clean_mols, n_feats = features.shape
     print(f"Created {n_feats} descriptors for {n_clean_mols} molecules.")
@@ -44,10 +47,8 @@ def predict(
     # setup model
     model = Tox21RFClassifier()
-    model_path = "checkpoints/rf_alltasks.joblib"
-    model.load_model(model_path)
-    print(f"Loaded model from {model_path}")
     # make predictions
     predictions = defaultdict(dict)
@@ -63,7 +64,7 @@ def predict(
         for smiles, pred in zip(smiles_list, preds):
             predictions[smiles][target] = float(pred)
-        if DEBUG:
             break
     return predictions
@@ -71,4 +72,8 @@ def predict(
 from testing import test_eval
-test_eval(predict, debug=DEBUG, use_only_clean=True, use_only_first=False)

 # Dependencies
 from collections import defaultdict
+import json
 import numpy as np
 from tqdm import tqdm
 from src.preprocess import create_descriptors
+from src.utils import TASKS, normalize_config
 from src.model import Tox21RFClassifier
 # ---------------------------------------------------------------------------------------
+CONFIG_FILE = "./config/config.json"
 def predict(
     """
     print(f"Received {len(smiles_list)} SMILES strings")
+    with open(CONFIG_FILE, "r") as f:
+        cfg = json.load(f)
+    cfg = normalize_config(cfg)
     features, is_clean = create_descriptors(
+        smiles_list, radius=cfg["ecfp_radius"], fpsize=cfg["ecfp_fpsize"]
     )
     n_clean_mols, n_feats = features.shape
     print(f"Created {n_feats} descriptors for {n_clean_mols} molecules.")
     # setup model
     model = Tox21RFClassifier()
+    model.load_model(cfg["model_path"])
+    print(f"Loaded model from {cfg['model_path']}")
     # make predictions
     predictions = defaultdict(dict)
         for smiles, pred in zip(smiles_list, preds):
             predictions[smiles][target] = float(pred)
+        if cfg["debug"]:
             break
     return predictions
 from testing import test_eval
+with open(CONFIG_FILE, "r") as f:
+    cfg = json.load(f)
+cfg = normalize_config(cfg)
+test_eval(predict, debug=cfg["debug"], use_only_clean=False, use_only_first=False)

src/utils.py CHANGED Viewed

@@ -450,3 +450,17 @@ def create_dir(path, is_file=False):
     to_create = os.path.dirname(path) if is_file else path
     if not os.path.exists(to_create):
         os.makedirs(to_create)

     to_create = os.path.dirname(path) if is_file else path
     if not os.path.exists(to_create):
         os.makedirs(to_create)
def _normalize_value(value, mapping):
    """Return ``value`` with sentinel strings replaced, descending into containers."""
    if isinstance(value, dict):
        return {key: _normalize_value(item, mapping) for key, item in value.items()}
    if isinstance(value, list):
        # JSON arrays decode to lists; normalize each element as well.
        return [_normalize_value(item, mapping) for item in value]
    # Only strings can be sentinels. The isinstance guard also keeps unhashable
    # values away from the dict lookup, which would raise TypeError.
    if isinstance(value, str):
        return mapping.get(value, value)
    return value


def normalize_config(config: dict) -> dict:
    """Normalize a JSON config recursively by applying a sentinel mapping.

    The exact (case-sensitive) strings ``"none"``, ``"true"`` and ``"false"``
    are replaced by ``None``, ``True`` and ``False``. Nested dicts and lists
    are traversed; every other value is kept as-is.

    Args:
        config: Mapping as returned by ``json.load``. It is not modified.

    Returns:
        A new dict with the same keys and normalized values.
    """
    mapping = {"none": None, "true": True, "false": False}
    return {key: _normalize_value(val, mapping) for key, val in config.items()}

train.py CHANGED Viewed

@@ -3,6 +3,7 @@ Script for fitting and saving any preprocessing assets, as well as the fitted RF
 """
 import os
 import random
 import logging
 import argparse
@@ -15,126 +16,20 @@ from datetime import datetime
 from src.model import Tox21RFClassifier
 from src.utils import (
     create_dir,
     USED_200_DESCR,
 )
-DEBUG = True
 parser = argparse.ArgumentParser(description="RF Training script for Tox21 dataset")
 parser.add_argument(
-    "--save_path",
-    type=str,
-    default="checkpoints/rf_alltasks.joblib",
-)
-parser.add_argument(
-    "--data_folder",
     type=str,
-    default="data/",
-)
-parser.add_argument(
-    "--seed",
-    type=int,
-    default=0,
-)
-parser.add_argument(
-    "--log_folder",
-    type=str,
-    default="logs/",
 )
-ECFP_RADIUS = 3
-ECFP_FPSIZE = 8192
-task_config = {
-    "NR-AR": {
-        "max_depth": None,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "min_samples_split": 5,
-        "n_estimators": 1000,
-    },
-    "NR-AR-LBD": {
-        "max_depth": 12,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "min_samples_split": 5,
-        "n_estimators": 1000,
-    },
-    "NR-AhR": {
-        "max_depth": None,
-        "max_features": "log2",
-        "min_samples_leaf": 1,
-        "min_samples_split": 2,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': None, 'cls__max_features': 'log2', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
-    "NR-Aromatase": {
-        "max_depth": None,
-        "max_features": "sqrt",
-        "min_samples_leaf": 4,
-        "min_samples_split": 12,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 4, 'cls__min_samples_split': 12, 'cls__n_estimators': 1000}
-    "NR-ER": {
-        "max_depth": 10,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "min_samples_split": 2,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': 10, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
-    "NR-ER-LBD": {
-        "max_depth": 8,
-        "max_features": "sqrt",
-        "min_samples_leaf": 2,
-        "min_samples_split": 5,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': 8, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 2, 'cls__min_samples_split': 5, 'cls__n_estimators': 1000}
-    "NR-PPAR-gamma": {
-        "max_depth": None,
-        "max_features": "log2",
-        "min_samples_leaf": 1,
-        "min_samples_split": 2,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': None, 'cls__max_features': 'log2', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
-    "SR-ARE": {
-        "max_depth": None,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "min_samples_split": 5,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 5, 'cls__n_estimators': 1000}
-    "SR-ATAD5": {
-        "max_depth": None,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "min_samples_split": 2,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
-    "SR-HSE": {
-        "max_depth": 16,
-        "max_features": "log2",
-        "min_samples_leaf": 1,
-        "min_samples_split": 2,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': 16, 'cls__max_features': 'log2', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
-    "SR-MMP": {
-        "max_depth": None,
-        "max_features": "sqrt",
-        "min_samples_leaf": 2,
-        "min_samples_split": 2,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 2, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
-    "SR-p53": {
-        "max_depth": None,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "min_samples_split": 2,
-        "n_estimators": 1000,
-    },  # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
-}
-def main(args):
     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
     # setup logger
@@ -146,7 +41,7 @@ def main(args):
         handlers=[
             logging.FileHandler(
                 os.path.join(
-                    args.log_folder,
                     f"{script_name}_{timestamp}.log",
                 )
             ),
@@ -154,19 +49,25 @@ def main(args):
         ],
     )
-    logger.info(args)
     # seeding
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    train_data = np.load(os.path.join(args.data_folder, "tox21_train_cv4.npz"))
     train_X = train_data[
         "features"
     ]  # np.concatenate([train_data[descr] for descr in KNOWN_DESCR], axis=1)
     train_y = train_data["labels"]
-    val_data = np.load(os.path.join(args.data_folder, "tox21_validation_cv4.npz"))
     val_X = val_data[
         "features"
     ]  # np.concatenate([val_data[descr] for descr in KNOWN_DESCR], axis=1)
@@ -174,22 +75,13 @@ def main(args):
     data = np.concatenate([train_X, val_X], axis=0)
     labels = np.concatenate([train_y, val_y], axis=0)
-    # # remove molecules that couldn't be sanitized
-    # mask = ~np.isnan(train_X).any(axis=1)
-    # train_X = train_X[mask]
-    # train_y = train_y[mask]
     full_data = np.load(
         "data/tox21_descriptors.npz",
         allow_pickle=True,
     )
-    # train_val_mask = full_data["sets"] != "test"
-    # data = full_data["features"][train_val_mask]
-    # labels = full_data["labels"][train_val_mask]
-    print("Train data shape:", data.shape)
     test_mask = full_data["sets"] == "test"
     test_data = full_data["features"][test_mask]
     test_labels = full_data["labels"][test_mask]
@@ -197,43 +89,43 @@ def main(args):
         data.shape[1] == test_data.shape[1]
     ), "different number of features found in train and test set!"
-    if args.save_path:
         logger.info(
-            f"Fitted RandomForestClassifier will be saved in folder: {args.save_path}"
         )
     else:
         logger.info("Fitted RandomForestClassifier will NOT be saved.")
     rdkit_descr_idxs = np.arange(data.shape[1] - len(USED_200_DESCR), data.shape[1])
     model = Tox21RFClassifier(
-        seed=args.seed, task_config=task_config, rdkit_desc_idxs=rdkit_descr_idxs
     )
     logger.info("Start training.")
-    print("Start training.")
     for i, task in enumerate(model.tasks):
-        logger.info(f"Fitting task: {task}")
         task_labels = labels[:, i]
         label_mask = ~np.isnan(task_labels)
         task_data = data[label_mask]
         task_labels = task_labels[label_mask].astype(int)
-        print(f"Fit task {task} using {sum(label_mask)} samples")
         model.fit(task, task_data, task_labels)
     log_text = f"Finished training."
     logger.info(log_text)
-    if args.save_path:
-        model.save_model(args.save_path)
-        logger.info(f"Save model as: {args.save_path}")
     del model
     model = Tox21RFClassifier()
-    model.load_model(args.save_path)
-    print("Evaluate model")
     results = {}
     preds = np.empty_like(test_labels, dtype=np.float32)
     for i, task in enumerate(model.tasks):
@@ -256,6 +148,10 @@ def main(args):
 if __name__ == "__main__":
     args = parser.parse_args()
-    create_dir(args.log_folder)
-    main(args)

 """
 import os
+import json
 import random
 import logging
 import argparse
 from src.model import Tox21RFClassifier
 from src.utils import (
     create_dir,
+    normalize_config,
     USED_200_DESCR,
 )
 parser = argparse.ArgumentParser(description="RF Training script for Tox21 dataset")
 parser.add_argument(
+    "--config",
     type=str,
+    default="config/config.json",
 )
+def main(cfg):
     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
     # setup logger
         handlers=[
             logging.FileHandler(
                 os.path.join(
+                    cfg["log_folder"],
                     f"{script_name}_{timestamp}.log",
                 )
             ),
         ],
     )
+    task_configs = cfg.pop("task_configs")
+    logger.info(f"Config: {cfg}")
+    task_configs_repr = "Task configs: \n" + "\n".join(
+        [str(val) for key, val in task_configs.items()]
+    )
+    logger.info(f"Task configs: \n{task_configs_repr}")
     # seeding
+    random.seed(cfg["seed"])
+    np.random.seed(cfg["seed"])
+    train_data = np.load(os.path.join(cfg["data_folder"], "tox21_train_cv4.npz"))
     train_X = train_data[
         "features"
     ]  # np.concatenate([train_data[descr] for descr in KNOWN_DESCR], axis=1)
     train_y = train_data["labels"]
+    val_data = np.load(os.path.join(cfg["data_folder"], "tox21_validation_cv4.npz"))
     val_X = val_data[
         "features"
     ]  # np.concatenate([val_data[descr] for descr in KNOWN_DESCR], axis=1)
     data = np.concatenate([train_X, val_X], axis=0)
     labels = np.concatenate([train_y, val_y], axis=0)
+    logger.info(f"Train data shape: {data.shape}")
     full_data = np.load(
         "data/tox21_descriptors.npz",
         allow_pickle=True,
     )
     test_mask = full_data["sets"] == "test"
     test_data = full_data["features"][test_mask]
     test_labels = full_data["labels"][test_mask]
         data.shape[1] == test_data.shape[1]
     ), "different number of features found in train and test set!"
+    if cfg["model_path"]:
         logger.info(
+            f"Fitted RandomForestClassifier will be saved as: {cfg['model_path']}"
         )
     else:
         logger.info("Fitted RandomForestClassifier will NOT be saved.")
     rdkit_descr_idxs = np.arange(data.shape[1] - len(USED_200_DESCR), data.shape[1])
     model = Tox21RFClassifier(
+        seed=cfg["seed"],
+        task_config=task_configs,
+        rdkit_desc_idxs=rdkit_descr_idxs,
     )
     logger.info("Start training.")
     for i, task in enumerate(model.tasks):
         task_labels = labels[:, i]
         label_mask = ~np.isnan(task_labels)
+        logger.info(f"Fit task {task} using {sum(label_mask)} samples")
         task_data = data[label_mask]
         task_labels = task_labels[label_mask].astype(int)
         model.fit(task, task_data, task_labels)
     log_text = f"Finished training."
     logger.info(log_text)
+    if cfg["model_path"]:
+        model.save_model(cfg["model_path"])
+        logger.info(f"Save model as: {cfg['model_path']}")
     del model
     model = Tox21RFClassifier()
+    model.load_model(cfg["model_path"])
+    logger.info("Evaluate model")
     results = {}
     preds = np.empty_like(test_labels, dtype=np.float32)
     for i, task in enumerate(model.tasks):
 if __name__ == "__main__":
     args = parser.parse_args()
+    with open(args.config, "r") as f:
+        cfg = json.load(f)
+    cfg = normalize_config(cfg)
+    create_dir(cfg["log_folder"])
+    main(cfg)