antoniaebner committed on
Commit
db0fcf9
·
1 Parent(s): 6770901

add config usage

Browse files
Files changed (4) hide show
  1. config/config.json +95 -0
  2. predict.py +16 -11
  3. src/utils.py +14 -0
  4. train.py +36 -140
config/config.json ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seed": 0,
3
+ "ecfp_radius": 3,
4
+ "ecfp_fpsize": 8192,
5
+ "model_path": "checkpoints/rf_alltasks.joblib",
6
+ "data_folder": "data_tox21/",
7
+ "log_folder": "logs/",
8
+ "debug": 1,
9
+ "task_configs": {
10
+ "NR-AR": {
11
+ "max_depth": "none",
12
+ "max_features": "sqrt",
13
+ "min_samples_leaf": 1,
14
+ "min_samples_split": 5,
15
+ "n_estimators": 1000
16
+ },
17
+ "NR-AR-LBD": {
18
+ "max_depth": 12,
19
+ "max_features": "sqrt",
20
+ "min_samples_leaf": 1,
21
+ "min_samples_split": 5,
22
+ "n_estimators": 1000
23
+ },
24
+ "NR-AhR": {
25
+ "max_depth": "none",
26
+ "max_features": "log2",
27
+ "min_samples_leaf": 1,
28
+ "min_samples_split": 2,
29
+ "n_estimators": 1000
30
+ },
31
+ "NR-Aromatase": {
32
+ "max_depth": "none",
33
+ "max_features": "sqrt",
34
+ "min_samples_leaf": 4,
35
+ "min_samples_split": 12,
36
+ "n_estimators": 1000
37
+ },
38
+ "NR-ER": {
39
+ "max_depth": 10,
40
+ "max_features": "sqrt",
41
+ "min_samples_leaf": 1,
42
+ "min_samples_split": 2,
43
+ "n_estimators": 1000
44
+ },
45
+ "NR-ER-LBD": {
46
+ "max_depth": 8,
47
+ "max_features": "sqrt",
48
+ "min_samples_leaf": 2,
49
+ "min_samples_split": 5,
50
+ "n_estimators": 1000
51
+ },
52
+ "NR-PPAR-gamma": {
53
+ "max_depth": "none",
54
+ "max_features": "log2",
55
+ "min_samples_leaf": 1,
56
+ "min_samples_split": 2,
57
+ "n_estimators": 1000
58
+ },
59
+ "SR-ARE": {
60
+ "max_depth": "none",
61
+ "max_features": "sqrt",
62
+ "min_samples_leaf": 1,
63
+ "min_samples_split": 5,
64
+ "n_estimators": 1000
65
+ },
66
+ "SR-ATAD5": {
67
+ "max_depth": "none",
68
+ "max_features": "sqrt",
69
+ "min_samples_leaf": 1,
70
+ "min_samples_split": 2,
71
+ "n_estimators": 1000
72
+ },
73
+ "SR-HSE": {
74
+ "max_depth": 16,
75
+ "max_features": "log2",
76
+ "min_samples_leaf": 1,
77
+ "min_samples_split": 2,
78
+ "n_estimators": 1000
79
+ },
80
+ "SR-MMP": {
81
+ "max_depth": "none",
82
+ "max_features": "sqrt",
83
+ "min_samples_leaf": 2,
84
+ "min_samples_split": 2,
85
+ "n_estimators": 1000
86
+ },
87
+ "SR-p53": {
88
+ "max_depth": "none",
89
+ "max_features": "sqrt",
90
+ "min_samples_leaf": 1,
91
+ "min_samples_split": 2,
92
+ "n_estimators": 1000
93
+ }
94
+ }
95
+ }
predict.py CHANGED
@@ -8,17 +8,16 @@ SMILES and target names as keys.
8
  # Dependencies
9
  from collections import defaultdict
10
 
 
11
  import numpy as np
12
 
13
  from tqdm import tqdm
14
  from src.preprocess import create_descriptors
15
- from src.utils import TASKS
16
  from src.model import Tox21RFClassifier
17
 
18
  # ---------------------------------------------------------------------------------------
19
- ECFP_RADIUS = 3
20
- ECFP_FPSIZE = 8192
21
- DEBUG = False
22
 
23
 
24
  def predict(
@@ -35,8 +34,12 @@ def predict(
35
  """
36
  print(f"Received {len(smiles_list)} SMILES strings")
37
 
 
 
 
 
38
  features, is_clean = create_descriptors(
39
- smiles_list, radius=ECFP_RADIUS, fpsize=ECFP_FPSIZE
40
  )
41
  n_clean_mols, n_feats = features.shape
42
  print(f"Created {n_feats} descriptors for {n_clean_mols} molecules.")
@@ -44,10 +47,8 @@ def predict(
44
 
45
  # setup model
46
  model = Tox21RFClassifier()
47
- model_path = "checkpoints/rf_alltasks.joblib"
48
- model.load_model(model_path)
49
-
50
- print(f"Loaded model from {model_path}")
51
 
52
  # make predictions
53
  predictions = defaultdict(dict)
@@ -63,7 +64,7 @@ def predict(
63
 
64
  for smiles, pred in zip(smiles_list, preds):
65
  predictions[smiles][target] = float(pred)
66
- if DEBUG:
67
  break
68
 
69
  return predictions
@@ -71,4 +72,8 @@ def predict(
71
 
72
  from testing import test_eval
73
 
74
- test_eval(predict, debug=DEBUG, use_only_clean=True, use_only_first=False)
 
 
 
 
 
8
  # Dependencies
9
  from collections import defaultdict
10
 
11
+ import json
12
  import numpy as np
13
 
14
  from tqdm import tqdm
15
  from src.preprocess import create_descriptors
16
+ from src.utils import TASKS, normalize_config
17
  from src.model import Tox21RFClassifier
18
 
19
  # ---------------------------------------------------------------------------------------
20
+ CONFIG_FILE = "./config/config.json"
 
 
21
 
22
 
23
  def predict(
 
34
  """
35
  print(f"Received {len(smiles_list)} SMILES strings")
36
 
37
+ with open(CONFIG_FILE, "r") as f:
38
+ cfg = json.load(f)
39
+ cfg = normalize_config(cfg)
40
+
41
  features, is_clean = create_descriptors(
42
+ smiles_list, radius=cfg["ecfp_radius"], fpsize=cfg["ecfp_fpsize"]
43
  )
44
  n_clean_mols, n_feats = features.shape
45
  print(f"Created {n_feats} descriptors for {n_clean_mols} molecules.")
 
47
 
48
  # setup model
49
  model = Tox21RFClassifier()
50
+ model.load_model(cfg["model_path"])
51
+ print(f"Loaded model from {cfg['model_path']}")
 
 
52
 
53
  # make predictions
54
  predictions = defaultdict(dict)
 
64
 
65
  for smiles, pred in zip(smiles_list, preds):
66
  predictions[smiles][target] = float(pred)
67
+ if cfg["debug"]:
68
  break
69
 
70
  return predictions
 
72
 
73
  from testing import test_eval
74
 
75
+ with open(CONFIG_FILE, "r") as f:
76
+ cfg = json.load(f)
77
+ cfg = normalize_config(cfg)
78
+
79
+ test_eval(predict, debug=cfg["debug"], use_only_clean=False, use_only_first=False)
src/utils.py CHANGED
@@ -450,3 +450,17 @@ def create_dir(path, is_file=False):
450
  to_create = os.path.dirname(path) if is_file else path
451
  if not os.path.exists(to_create):
452
  os.makedirs(to_create)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  to_create = os.path.dirname(path) if is_file else path
451
  if not os.path.exists(to_create):
452
  os.makedirs(to_create)
453
+
454
+
455
def normalize_config(config: dict):
    """Return a normalized copy of a JSON-loaded config.

    JSON configs in this project spell some Python constants as strings.
    This function walks the config recursively and replaces the exact
    (case-sensitive) strings ``"none"``, ``"true"`` and ``"false"`` with
    ``None``, ``True`` and ``False``.

    Args:
        config: Mapping produced by ``json.load``. It is not modified.

    Returns:
        A new dict with the same keys. Nested dicts and lists are rebuilt
        recursively; every other value is passed through unchanged.
    """
    mapping = {"none": None, "true": True, "false": False}

    def _normalize_value(val):
        """Normalize a single value, descending into dicts and lists."""
        if isinstance(val, dict):
            return normalize_config(val)
        if isinstance(val, list):
            # JSON arrays arrive as lists; they are unhashable, so they must
            # be handled before any dict-membership test.
            return [_normalize_value(item) for item in val]
        # Only strings can be placeholders. Guarding on the type keeps the
        # lookup from hashing arbitrary (possibly unhashable) values.
        if isinstance(val, str) and val in mapping:
            return mapping[val]
        return val

    return {key: _normalize_value(val) for key, val in config.items()}
train.py CHANGED
@@ -3,6 +3,7 @@ Script for fitting and saving any preprocessing assets, as well as the fitted RF
3
  """
4
 
5
  import os
 
6
  import random
7
  import logging
8
  import argparse
@@ -15,126 +16,20 @@ from datetime import datetime
15
  from src.model import Tox21RFClassifier
16
  from src.utils import (
17
  create_dir,
 
18
  USED_200_DESCR,
19
  )
20
 
21
- DEBUG = True
22
-
23
  parser = argparse.ArgumentParser(description="RF Training script for Tox21 dataset")
24
 
25
  parser.add_argument(
26
- "--save_path",
27
- type=str,
28
- default="checkpoints/rf_alltasks.joblib",
29
- )
30
- parser.add_argument(
31
- "--data_folder",
32
  type=str,
33
- default="data/",
34
- )
35
- parser.add_argument(
36
- "--seed",
37
- type=int,
38
- default=0,
39
- )
40
- parser.add_argument(
41
- "--log_folder",
42
- type=str,
43
- default="logs/",
44
  )
45
 
46
- ECFP_RADIUS = 3
47
- ECFP_FPSIZE = 8192
48
-
49
- task_config = {
50
- "NR-AR": {
51
- "max_depth": None,
52
- "max_features": "sqrt",
53
- "min_samples_leaf": 1,
54
- "min_samples_split": 5,
55
- "n_estimators": 1000,
56
- },
57
- "NR-AR-LBD": {
58
- "max_depth": 12,
59
- "max_features": "sqrt",
60
- "min_samples_leaf": 1,
61
- "min_samples_split": 5,
62
- "n_estimators": 1000,
63
- },
64
- "NR-AhR": {
65
- "max_depth": None,
66
- "max_features": "log2",
67
- "min_samples_leaf": 1,
68
- "min_samples_split": 2,
69
- "n_estimators": 1000,
70
- }, # {'cls__max_depth': None, 'cls__max_features': 'log2', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
71
- "NR-Aromatase": {
72
- "max_depth": None,
73
- "max_features": "sqrt",
74
- "min_samples_leaf": 4,
75
- "min_samples_split": 12,
76
- "n_estimators": 1000,
77
- }, # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 4, 'cls__min_samples_split': 12, 'cls__n_estimators': 1000}
78
- "NR-ER": {
79
- "max_depth": 10,
80
- "max_features": "sqrt",
81
- "min_samples_leaf": 1,
82
- "min_samples_split": 2,
83
- "n_estimators": 1000,
84
- }, # {'cls__max_depth': 10, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
85
- "NR-ER-LBD": {
86
- "max_depth": 8,
87
- "max_features": "sqrt",
88
- "min_samples_leaf": 2,
89
- "min_samples_split": 5,
90
- "n_estimators": 1000,
91
- }, # {'cls__max_depth': 8, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 2, 'cls__min_samples_split': 5, 'cls__n_estimators': 1000}
92
- "NR-PPAR-gamma": {
93
- "max_depth": None,
94
- "max_features": "log2",
95
- "min_samples_leaf": 1,
96
- "min_samples_split": 2,
97
- "n_estimators": 1000,
98
- }, # {'cls__max_depth': None, 'cls__max_features': 'log2', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
99
- "SR-ARE": {
100
- "max_depth": None,
101
- "max_features": "sqrt",
102
- "min_samples_leaf": 1,
103
- "min_samples_split": 5,
104
- "n_estimators": 1000,
105
- }, # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 5, 'cls__n_estimators': 1000}
106
- "SR-ATAD5": {
107
- "max_depth": None,
108
- "max_features": "sqrt",
109
- "min_samples_leaf": 1,
110
- "min_samples_split": 2,
111
- "n_estimators": 1000,
112
- }, # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
113
- "SR-HSE": {
114
- "max_depth": 16,
115
- "max_features": "log2",
116
- "min_samples_leaf": 1,
117
- "min_samples_split": 2,
118
- "n_estimators": 1000,
119
- }, # {'cls__max_depth': 16, 'cls__max_features': 'log2', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
120
- "SR-MMP": {
121
- "max_depth": None,
122
- "max_features": "sqrt",
123
- "min_samples_leaf": 2,
124
- "min_samples_split": 2,
125
- "n_estimators": 1000,
126
- }, # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 2, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
127
- "SR-p53": {
128
- "max_depth": None,
129
- "max_features": "sqrt",
130
- "min_samples_leaf": 1,
131
- "min_samples_split": 2,
132
- "n_estimators": 1000,
133
- }, # {'cls__max_depth': None, 'cls__max_features': 'sqrt', 'cls__min_samples_leaf': 1, 'cls__min_samples_split': 2, 'cls__n_estimators': 1000}
134
- }
135
-
136
-
137
- def main(args):
138
  timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
139
 
140
  # setup logger
@@ -146,7 +41,7 @@ def main(args):
146
  handlers=[
147
  logging.FileHandler(
148
  os.path.join(
149
- args.log_folder,
150
  f"{script_name}_{timestamp}.log",
151
  )
152
  ),
@@ -154,19 +49,25 @@ def main(args):
154
  ],
155
  )
156
 
157
- logger.info(args)
 
 
 
 
 
 
158
 
159
  # seeding
160
- random.seed(args.seed)
161
- np.random.seed(args.seed)
162
 
163
- train_data = np.load(os.path.join(args.data_folder, "tox21_train_cv4.npz"))
164
  train_X = train_data[
165
  "features"
166
  ] # np.concatenate([train_data[descr] for descr in KNOWN_DESCR], axis=1)
167
  train_y = train_data["labels"]
168
 
169
- val_data = np.load(os.path.join(args.data_folder, "tox21_validation_cv4.npz"))
170
  val_X = val_data[
171
  "features"
172
  ] # np.concatenate([val_data[descr] for descr in KNOWN_DESCR], axis=1)
@@ -174,22 +75,13 @@ def main(args):
174
 
175
  data = np.concatenate([train_X, val_X], axis=0)
176
  labels = np.concatenate([train_y, val_y], axis=0)
177
-
178
- # # remove molecules that couldn't be sanitized
179
- # mask = ~np.isnan(train_X).any(axis=1)
180
- # train_X = train_X[mask]
181
- # train_y = train_y[mask]
182
 
183
  full_data = np.load(
184
  "data/tox21_descriptors.npz",
185
  allow_pickle=True,
186
  )
187
 
188
- # train_val_mask = full_data["sets"] != "test"
189
- # data = full_data["features"][train_val_mask]
190
- # labels = full_data["labels"][train_val_mask]
191
- print("Train data shape:", data.shape)
192
-
193
  test_mask = full_data["sets"] == "test"
194
  test_data = full_data["features"][test_mask]
195
  test_labels = full_data["labels"][test_mask]
@@ -197,43 +89,43 @@ def main(args):
197
  data.shape[1] == test_data.shape[1]
198
  ), "different number of features found in train and test set!"
199
 
200
- if args.save_path:
201
  logger.info(
202
- f"Fitted RandomForestClassifier will be saved in folder: {args.save_path}"
203
  )
204
  else:
205
  logger.info("Fitted RandomForestClassifier will NOT be saved.")
206
 
207
  rdkit_descr_idxs = np.arange(data.shape[1] - len(USED_200_DESCR), data.shape[1])
208
  model = Tox21RFClassifier(
209
- seed=args.seed, task_config=task_config, rdkit_desc_idxs=rdkit_descr_idxs
 
 
210
  )
211
 
212
  logger.info("Start training.")
213
- print("Start training.")
214
  for i, task in enumerate(model.tasks):
215
- logger.info(f"Fitting task: {task}")
216
  task_labels = labels[:, i]
217
  label_mask = ~np.isnan(task_labels)
 
218
 
219
  task_data = data[label_mask]
220
  task_labels = task_labels[label_mask].astype(int)
221
 
222
- print(f"Fit task {task} using {sum(label_mask)} samples")
223
  model.fit(task, task_data, task_labels)
224
 
225
  log_text = f"Finished training."
226
  logger.info(log_text)
227
 
228
- if args.save_path:
229
- model.save_model(args.save_path)
230
- logger.info(f"Save model as: {args.save_path}")
231
 
232
  del model
233
  model = Tox21RFClassifier()
234
- model.load_model(args.save_path)
235
 
236
- print("Evaluate model")
237
  results = {}
238
  preds = np.empty_like(test_labels, dtype=np.float32)
239
  for i, task in enumerate(model.tasks):
@@ -256,6 +148,10 @@ def main(args):
256
  if __name__ == "__main__":
257
  args = parser.parse_args()
258
 
259
- create_dir(args.log_folder)
 
 
 
 
260
 
261
- main(args)
 
3
  """
4
 
5
  import os
6
+ import json
7
  import random
8
  import logging
9
  import argparse
 
16
  from src.model import Tox21RFClassifier
17
  from src.utils import (
18
  create_dir,
19
+ normalize_config,
20
  USED_200_DESCR,
21
  )
22
 
 
 
23
  parser = argparse.ArgumentParser(description="RF Training script for Tox21 dataset")
24
 
25
  parser.add_argument(
26
+ "--config",
 
 
 
 
 
27
  type=str,
28
+ default="config/config.json",
 
 
 
 
 
 
 
 
 
 
29
  )
30
 
31
+
32
+ def main(cfg):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
34
 
35
  # setup logger
 
41
  handlers=[
42
  logging.FileHandler(
43
  os.path.join(
44
+ cfg["log_folder"],
45
  f"{script_name}_{timestamp}.log",
46
  )
47
  ),
 
49
  ],
50
  )
51
 
52
+ task_configs = cfg.pop("task_configs")
53
+ logger.info(f"Config: {cfg}")
54
+ task_configs_repr = "Task configs: \n" + "\n".join(
55
+ [str(val) for key, val in task_configs.items()]
56
+ )
57
+
58
+ logger.info(f"Task configs: \n{task_configs_repr}")
59
 
60
  # seeding
61
+ random.seed(cfg["seed"])
62
+ np.random.seed(cfg["seed"])
63
 
64
+ train_data = np.load(os.path.join(cfg["data_folder"], "tox21_train_cv4.npz"))
65
  train_X = train_data[
66
  "features"
67
  ] # np.concatenate([train_data[descr] for descr in KNOWN_DESCR], axis=1)
68
  train_y = train_data["labels"]
69
 
70
+ val_data = np.load(os.path.join(cfg["data_folder"], "tox21_validation_cv4.npz"))
71
  val_X = val_data[
72
  "features"
73
  ] # np.concatenate([val_data[descr] for descr in KNOWN_DESCR], axis=1)
 
75
 
76
  data = np.concatenate([train_X, val_X], axis=0)
77
  labels = np.concatenate([train_y, val_y], axis=0)
78
+ logger.info(f"Train data shape: {data.shape}")
 
 
 
 
79
 
80
  full_data = np.load(
81
  "data/tox21_descriptors.npz",
82
  allow_pickle=True,
83
  )
84
 
 
 
 
 
 
85
  test_mask = full_data["sets"] == "test"
86
  test_data = full_data["features"][test_mask]
87
  test_labels = full_data["labels"][test_mask]
 
89
  data.shape[1] == test_data.shape[1]
90
  ), "different number of features found in train and test set!"
91
 
92
+ if cfg["model_path"]:
93
  logger.info(
94
+ f"Fitted RandomForestClassifier will be saved as: {cfg['model_path']}"
95
  )
96
  else:
97
  logger.info("Fitted RandomForestClassifier will NOT be saved.")
98
 
99
  rdkit_descr_idxs = np.arange(data.shape[1] - len(USED_200_DESCR), data.shape[1])
100
  model = Tox21RFClassifier(
101
+ seed=cfg["seed"],
102
+ task_config=task_configs,
103
+ rdkit_desc_idxs=rdkit_descr_idxs,
104
  )
105
 
106
  logger.info("Start training.")
 
107
  for i, task in enumerate(model.tasks):
 
108
  task_labels = labels[:, i]
109
  label_mask = ~np.isnan(task_labels)
110
+ logger.info(f"Fit task {task} using {sum(label_mask)} samples")
111
 
112
  task_data = data[label_mask]
113
  task_labels = task_labels[label_mask].astype(int)
114
 
 
115
  model.fit(task, task_data, task_labels)
116
 
117
  log_text = f"Finished training."
118
  logger.info(log_text)
119
 
120
+ if cfg["model_path"]:
121
+ model.save_model(cfg["model_path"])
122
+ logger.info(f"Save model as: {cfg['model_path']}")
123
 
124
  del model
125
  model = Tox21RFClassifier()
126
+ model.load_model(cfg["model_path"])
127
 
128
+ logger.info("Evaluate model")
129
  results = {}
130
  preds = np.empty_like(test_labels, dtype=np.float32)
131
  for i, task in enumerate(model.tasks):
 
148
  if __name__ == "__main__":
149
  args = parser.parse_args()
150
 
151
+ with open(args.config, "r") as f:
152
+ cfg = json.load(f)
153
+ cfg = normalize_config(cfg)
154
+
155
+ create_dir(cfg["log_folder"])
156
 
157
+ main(cfg)