antoniaebner committed on
Commit
9fabbe2
·
1 Parent(s): db0fcf9

cleanup; remove eval from train.py

Browse files
Files changed (6) hide show
  1. config/config.json +1 -1
  2. predict.py +4 -4
  3. preprocess.py +0 -1
  4. src/model.py +2 -2
  5. src/preprocess.py +1 -5
  6. train.py +2 -37
config/config.json CHANGED
@@ -5,7 +5,7 @@
5
  "model_path": "checkpoints/rf_alltasks.joblib",
6
  "data_folder": "data_tox21/",
7
  "log_folder": "logs/",
8
- "debug": 1,
9
  "task_configs": {
10
  "NR-AR": {
11
  "max_depth": "none",
 
5
  "model_path": "checkpoints/rf_alltasks.joblib",
6
  "data_folder": "data_tox21/",
7
  "log_folder": "logs/",
8
+ "debug": "false",
9
  "task_configs": {
10
  "NR-AR": {
11
  "max_depth": "none",
predict.py CHANGED
@@ -10,8 +10,8 @@ from collections import defaultdict
10
 
11
  import json
12
  import numpy as np
13
-
14
  from tqdm import tqdm
 
15
  from src.preprocess import create_descriptors
16
  from src.utils import TASKS, normalize_config
17
  from src.model import Tox21RFClassifier
@@ -73,7 +73,7 @@ def predict(
73
  from testing import test_eval
74
 
75
  with open(CONFIG_FILE, "r") as f:
76
- cfg = json.load(f)
77
- cfg = normalize_config(cfg)
78
 
79
- test_eval(predict, debug=cfg["debug"], use_only_clean=False, use_only_first=False)
 
10
 
11
  import json
12
  import numpy as np
 
13
  from tqdm import tqdm
14
+
15
  from src.preprocess import create_descriptors
16
  from src.utils import TASKS, normalize_config
17
  from src.model import Tox21RFClassifier
 
73
  from testing import test_eval
74
 
75
  with open(CONFIG_FILE, "r") as f:
76
+ config = json.load(f)
77
+ config = normalize_config(config)
78
 
79
+ test_eval(predict, debug=config["debug"], use_only_clean=False, use_only_first=False)
preprocess.py CHANGED
@@ -15,7 +15,6 @@ from src.preprocess import create_descriptors, get_tox21_split
15
  from src.utils import (
16
  TASKS,
17
  HF_TOKEN,
18
- write_pickle,
19
  create_dir,
20
  )
21
 
 
15
  from src.utils import (
16
  TASKS,
17
  HF_TOKEN,
 
18
  create_dir,
19
  )
20
 
src/model.py CHANGED
@@ -11,10 +11,10 @@ import joblib
11
 
12
  import numpy as np
13
 
 
14
  from sklearn.ensemble import RandomForestClassifier
15
- from sklearn.preprocessing import StandardScaler
16
  from sklearn.feature_selection import VarianceThreshold
17
- from sklearn.base import BaseEstimator, TransformerMixin
18
  from statsmodels.distributions.empirical_distribution import ECDF
19
 
20
  from .utils import TASKS
 
11
 
12
  import numpy as np
13
 
14
+ from sklearn.base import BaseEstimator, TransformerMixin
15
  from sklearn.ensemble import RandomForestClassifier
 
16
  from sklearn.feature_selection import VarianceThreshold
17
+ from sklearn.preprocessing import StandardScaler
18
  from statsmodels.distributions.empirical_distribution import ECDF
19
 
20
  from .utils import TASKS
src/preprocess.py CHANGED
@@ -18,11 +18,7 @@ from rdkit import Chem, DataStructs
18
  from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
19
  from rdkit.Chem.rdchem import Mol
20
 
21
- from .utils import (
22
- USED_200_DESCR,
23
- TOX_SMARTS_PATH,
24
- Standardizer,
25
- )
26
 
27
 
28
  def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
 
18
  from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
19
  from rdkit.Chem.rdchem import Mol
20
 
21
+ from .utils import USED_200_DESCR, TOX_SMARTS_PATH, Standardizer
 
 
 
 
22
 
23
 
24
  def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
train.py CHANGED
@@ -9,8 +9,6 @@ import logging
9
  import argparse
10
 
11
  import numpy as np
12
-
13
- from sklearn.metrics import roc_auc_score
14
  from datetime import datetime
15
 
16
  from src.model import Tox21RFClassifier
@@ -77,18 +75,6 @@ def main(cfg):
77
  labels = np.concatenate([train_y, val_y], axis=0)
78
  logger.info(f"Train data shape: {data.shape}")
79
 
80
- full_data = np.load(
81
- "data/tox21_descriptors.npz",
82
- allow_pickle=True,
83
- )
84
-
85
- test_mask = full_data["sets"] == "test"
86
- test_data = full_data["features"][test_mask]
87
- test_labels = full_data["labels"][test_mask]
88
- assert (
89
- data.shape[1] == test_data.shape[1]
90
- ), "different number of features found in train and test set!"
91
-
92
  if cfg["model_path"]:
93
  logger.info(
94
  f"Fitted RandomForestClassifier will be saved as: {cfg['model_path']}"
@@ -113,6 +99,8 @@ def main(cfg):
113
  task_labels = task_labels[label_mask].astype(int)
114
 
115
  model.fit(task, task_data, task_labels)
 
 
116
 
117
  log_text = f"Finished training."
118
  logger.info(log_text)
@@ -121,29 +109,6 @@ def main(cfg):
121
  model.save_model(cfg["model_path"])
122
  logger.info(f"Save model as: {cfg['model_path']}")
123
 
124
- del model
125
- model = Tox21RFClassifier()
126
- model.load_model(cfg["model_path"])
127
-
128
- logger.info("Evaluate model")
129
- results = {}
130
- preds = np.empty_like(test_labels, dtype=np.float32)
131
- for i, task in enumerate(model.tasks):
132
- task_labels = test_labels[:, i]
133
- label_mask = ~np.isnan(task_labels)
134
-
135
- task_labels = task_labels[label_mask].astype(int)
136
-
137
- pred = model.predict(task, test_data)
138
- results[task] = [roc_auc_score(y_true=task_labels, y_score=pred[label_mask])]
139
- preds[:, i] = pred.copy()
140
-
141
- logger.info("Results:")
142
- logger.info(results)
143
- logger.info(
144
- f"Average: {sum([score[0] for score in results.values()]) / len(results)}"
145
- )
146
-
147
 
148
  if __name__ == "__main__":
149
  args = parser.parse_args()
 
9
  import argparse
10
 
11
  import numpy as np
 
 
12
  from datetime import datetime
13
 
14
  from src.model import Tox21RFClassifier
 
75
  labels = np.concatenate([train_y, val_y], axis=0)
76
  logger.info(f"Train data shape: {data.shape}")
77
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  if cfg["model_path"]:
79
  logger.info(
80
  f"Fitted RandomForestClassifier will be saved as: {cfg['model_path']}"
 
99
  task_labels = task_labels[label_mask].astype(int)
100
 
101
  model.fit(task, task_data, task_labels)
102
+ if cfg["debug"]:
103
+ break
104
 
105
  log_text = f"Finished training."
106
  logger.info(log_text)
 
109
  model.save_model(cfg["model_path"])
110
  logger.info(f"Save model as: {cfg['model_path']}")
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  if __name__ == "__main__":
114
  args = parser.parse_args()