Spaces:
Sleeping
Sleeping
Commit
·
9fabbe2
1
Parent(s):
db0fcf9
cleanup; remove eval from train.py
Browse files
- config/config.json +1 -1
- predict.py +4 -4
- preprocess.py +0 -1
- src/model.py +2 -2
- src/preprocess.py +1 -5
- train.py +2 -37
config/config.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"model_path": "checkpoints/rf_alltasks.joblib",
|
| 6 |
"data_folder": "data_tox21/",
|
| 7 |
"log_folder": "logs/",
|
| 8 |
-
"debug":
|
| 9 |
"task_configs": {
|
| 10 |
"NR-AR": {
|
| 11 |
"max_depth": "none",
|
|
|
|
| 5 |
"model_path": "checkpoints/rf_alltasks.joblib",
|
| 6 |
"data_folder": "data_tox21/",
|
| 7 |
"log_folder": "logs/",
|
| 8 |
+
"debug": "false",
|
| 9 |
"task_configs": {
|
| 10 |
"NR-AR": {
|
| 11 |
"max_depth": "none",
|
predict.py
CHANGED
|
@@ -10,8 +10,8 @@ from collections import defaultdict
|
|
| 10 |
|
| 11 |
import json
|
| 12 |
import numpy as np
|
| 13 |
-
|
| 14 |
from tqdm import tqdm
|
|
|
|
| 15 |
from src.preprocess import create_descriptors
|
| 16 |
from src.utils import TASKS, normalize_config
|
| 17 |
from src.model import Tox21RFClassifier
|
|
@@ -73,7 +73,7 @@ def predict(
|
|
| 73 |
from testing import test_eval
|
| 74 |
|
| 75 |
with open(CONFIG_FILE, "r") as f:
|
| 76 |
-
|
| 77 |
-
|
| 78 |
|
| 79 |
-
test_eval(predict, debug=
|
|
|
|
| 10 |
|
| 11 |
import json
|
| 12 |
import numpy as np
|
|
|
|
| 13 |
from tqdm import tqdm
|
| 14 |
+
|
| 15 |
from src.preprocess import create_descriptors
|
| 16 |
from src.utils import TASKS, normalize_config
|
| 17 |
from src.model import Tox21RFClassifier
|
|
|
|
| 73 |
from testing import test_eval
|
| 74 |
|
| 75 |
with open(CONFIG_FILE, "r") as f:
|
| 76 |
+
config = json.load(f)
|
| 77 |
+
config = normalize_config(config)
|
| 78 |
|
| 79 |
+
test_eval(predict, debug=config["debug"], use_only_clean=False, use_only_first=False)
|
preprocess.py
CHANGED
|
@@ -15,7 +15,6 @@ from src.preprocess import create_descriptors, get_tox21_split
|
|
| 15 |
from src.utils import (
|
| 16 |
TASKS,
|
| 17 |
HF_TOKEN,
|
| 18 |
-
write_pickle,
|
| 19 |
create_dir,
|
| 20 |
)
|
| 21 |
|
|
|
|
| 15 |
from src.utils import (
|
| 16 |
TASKS,
|
| 17 |
HF_TOKEN,
|
|
|
|
| 18 |
create_dir,
|
| 19 |
)
|
| 20 |
|
src/model.py
CHANGED
|
@@ -11,10 +11,10 @@ import joblib
|
|
| 11 |
|
| 12 |
import numpy as np
|
| 13 |
|
|
|
|
| 14 |
from sklearn.ensemble import RandomForestClassifier
|
| 15 |
-
from sklearn.preprocessing import StandardScaler
|
| 16 |
from sklearn.feature_selection import VarianceThreshold
|
| 17 |
-
from sklearn.
|
| 18 |
from statsmodels.distributions.empirical_distribution import ECDF
|
| 19 |
|
| 20 |
from .utils import TASKS
|
|
|
|
| 11 |
|
| 12 |
import numpy as np
|
| 13 |
|
| 14 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
| 15 |
from sklearn.ensemble import RandomForestClassifier
|
|
|
|
| 16 |
from sklearn.feature_selection import VarianceThreshold
|
| 17 |
+
from sklearn.preprocessing import StandardScaler
|
| 18 |
from statsmodels.distributions.empirical_distribution import ECDF
|
| 19 |
|
| 20 |
from .utils import TASKS
|
src/preprocess.py
CHANGED
|
@@ -18,11 +18,7 @@ from rdkit import Chem, DataStructs
|
|
| 18 |
from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
|
| 19 |
from rdkit.Chem.rdchem import Mol
|
| 20 |
|
| 21 |
-
from .utils import (
|
| 22 |
-
USED_200_DESCR,
|
| 23 |
-
TOX_SMARTS_PATH,
|
| 24 |
-
Standardizer,
|
| 25 |
-
)
|
| 26 |
|
| 27 |
|
| 28 |
def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
|
|
|
|
| 18 |
from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
|
| 19 |
from rdkit.Chem.rdchem import Mol
|
| 20 |
|
| 21 |
+
from .utils import USED_200_DESCR, TOX_SMARTS_PATH, Standardizer
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
|
train.py
CHANGED
|
@@ -9,8 +9,6 @@ import logging
|
|
| 9 |
import argparse
|
| 10 |
|
| 11 |
import numpy as np
|
| 12 |
-
|
| 13 |
-
from sklearn.metrics import roc_auc_score
|
| 14 |
from datetime import datetime
|
| 15 |
|
| 16 |
from src.model import Tox21RFClassifier
|
|
@@ -77,18 +75,6 @@ def main(cfg):
|
|
| 77 |
labels = np.concatenate([train_y, val_y], axis=0)
|
| 78 |
logger.info(f"Train data shape: {data.shape}")
|
| 79 |
|
| 80 |
-
full_data = np.load(
|
| 81 |
-
"data/tox21_descriptors.npz",
|
| 82 |
-
allow_pickle=True,
|
| 83 |
-
)
|
| 84 |
-
|
| 85 |
-
test_mask = full_data["sets"] == "test"
|
| 86 |
-
test_data = full_data["features"][test_mask]
|
| 87 |
-
test_labels = full_data["labels"][test_mask]
|
| 88 |
-
assert (
|
| 89 |
-
data.shape[1] == test_data.shape[1]
|
| 90 |
-
), "different number of features found in train and test set!"
|
| 91 |
-
|
| 92 |
if cfg["model_path"]:
|
| 93 |
logger.info(
|
| 94 |
f"Fitted RandomForestClassifier will be saved as: {cfg['model_path']}"
|
|
@@ -113,6 +99,8 @@ def main(cfg):
|
|
| 113 |
task_labels = task_labels[label_mask].astype(int)
|
| 114 |
|
| 115 |
model.fit(task, task_data, task_labels)
|
|
|
|
|
|
|
| 116 |
|
| 117 |
log_text = f"Finished training."
|
| 118 |
logger.info(log_text)
|
|
@@ -121,29 +109,6 @@ def main(cfg):
|
|
| 121 |
model.save_model(cfg["model_path"])
|
| 122 |
logger.info(f"Save model as: {cfg['model_path']}")
|
| 123 |
|
| 124 |
-
del model
|
| 125 |
-
model = Tox21RFClassifier()
|
| 126 |
-
model.load_model(cfg["model_path"])
|
| 127 |
-
|
| 128 |
-
logger.info("Evaluate model")
|
| 129 |
-
results = {}
|
| 130 |
-
preds = np.empty_like(test_labels, dtype=np.float32)
|
| 131 |
-
for i, task in enumerate(model.tasks):
|
| 132 |
-
task_labels = test_labels[:, i]
|
| 133 |
-
label_mask = ~np.isnan(task_labels)
|
| 134 |
-
|
| 135 |
-
task_labels = task_labels[label_mask].astype(int)
|
| 136 |
-
|
| 137 |
-
pred = model.predict(task, test_data)
|
| 138 |
-
results[task] = [roc_auc_score(y_true=task_labels, y_score=pred[label_mask])]
|
| 139 |
-
preds[:, i] = pred.copy()
|
| 140 |
-
|
| 141 |
-
logger.info("Results:")
|
| 142 |
-
logger.info(results)
|
| 143 |
-
logger.info(
|
| 144 |
-
f"Average: {sum([score[0] for score in results.values()]) / len(results)}"
|
| 145 |
-
)
|
| 146 |
-
|
| 147 |
|
| 148 |
if __name__ == "__main__":
|
| 149 |
args = parser.parse_args()
|
|
|
|
| 9 |
import argparse
|
| 10 |
|
| 11 |
import numpy as np
|
|
|
|
|
|
|
| 12 |
from datetime import datetime
|
| 13 |
|
| 14 |
from src.model import Tox21RFClassifier
|
|
|
|
| 75 |
labels = np.concatenate([train_y, val_y], axis=0)
|
| 76 |
logger.info(f"Train data shape: {data.shape}")
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
if cfg["model_path"]:
|
| 79 |
logger.info(
|
| 80 |
f"Fitted RandomForestClassifier will be saved as: {cfg['model_path']}"
|
|
|
|
| 99 |
task_labels = task_labels[label_mask].astype(int)
|
| 100 |
|
| 101 |
model.fit(task, task_data, task_labels)
|
| 102 |
+
if cfg["debug"]:
|
| 103 |
+
break
|
| 104 |
|
| 105 |
log_text = f"Finished training."
|
| 106 |
logger.info(log_text)
|
|
|
|
| 109 |
model.save_model(cfg["model_path"])
|
| 110 |
logger.info(f"Save model as: {cfg['model_path']}")
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
if __name__ == "__main__":
|
| 114 |
args = parser.parse_args()
|