Holmes committed on
Commit ca7299e · 1 Parent(s): 1af230e
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. README.md +134 -3
  2. analysis/Ramachandran_plot.py +99 -0
  3. analysis/__pycache__/Ramachandran_plot.cpython-310.pyc +0 -0
  4. analysis/__pycache__/merge_pred_pdb.cpython-310.pyc +0 -0
  5. analysis/__pycache__/metrics.cpython-310.pyc +0 -0
  6. analysis/__pycache__/utils.cpython-310.pyc +0 -0
  7. analysis/__pycache__/utils.cpython-38.pyc +0 -0
  8. analysis/eval_result.py +66 -0
  9. analysis/merge_pred_pdb.py +45 -0
  10. analysis/metrics.py +54 -0
  11. analysis/pca_analyse.py +116 -0
  12. analysis/src/__init__.py +0 -0
  13. analysis/src/__pycache__/__init__.cpython-310.pyc +0 -0
  14. analysis/src/__pycache__/__init__.cpython-37.pyc +0 -0
  15. analysis/src/__pycache__/__init__.cpython-39.pyc +0 -0
  16. analysis/src/__pycache__/eval.cpython-310.pyc +0 -0
  17. analysis/src/__pycache__/eval.cpython-37.pyc +0 -0
  18. analysis/src/__pycache__/eval.cpython-39.pyc +0 -0
  19. analysis/src/common/__init__.py +0 -0
  20. analysis/src/common/__pycache__/__init__.cpython-310.pyc +0 -0
  21. analysis/src/common/__pycache__/__init__.cpython-39.pyc +0 -0
  22. analysis/src/common/__pycache__/all_atom.cpython-39.pyc +0 -0
  23. analysis/src/common/__pycache__/data_transforms.cpython-39.pyc +0 -0
  24. analysis/src/common/__pycache__/geo_utils.cpython-310.pyc +0 -0
  25. analysis/src/common/__pycache__/geo_utils.cpython-39.pyc +0 -0
  26. analysis/src/common/__pycache__/pdb_utils.cpython-310.pyc +0 -0
  27. analysis/src/common/__pycache__/pdb_utils.cpython-39.pyc +0 -0
  28. analysis/src/common/__pycache__/protein.cpython-310.pyc +0 -0
  29. analysis/src/common/__pycache__/protein.cpython-39.pyc +0 -0
  30. analysis/src/common/__pycache__/residue_constants.cpython-310.pyc +0 -0
  31. analysis/src/common/__pycache__/residue_constants.cpython-39.pyc +0 -0
  32. analysis/src/common/__pycache__/rigid_utils.cpython-39.pyc +0 -0
  33. analysis/src/common/__pycache__/rotation3d.cpython-39.pyc +0 -0
  34. analysis/src/common/all_atom.py +219 -0
  35. analysis/src/common/data_transforms.py +1194 -0
  36. analysis/src/common/geo_utils.py +155 -0
  37. analysis/src/common/pdb_utils.py +353 -0
  38. analysis/src/common/protein.py +289 -0
  39. analysis/src/common/residue_constants.py +897 -0
  40. analysis/src/common/rigid_utils.py +1451 -0
  41. analysis/src/common/rotation3d.py +596 -0
  42. analysis/src/data/__init__.py +0 -0
  43. analysis/src/data/__pycache__/__init__.cpython-39.pyc +0 -0
  44. analysis/src/data/__pycache__/protein_datamodule.cpython-39.pyc +0 -0
  45. analysis/src/data/components/__init__.py +0 -0
  46. analysis/src/data/components/__pycache__/__init__.cpython-39.pyc +0 -0
  47. analysis/src/data/components/__pycache__/dataset.cpython-39.pyc +0 -0
  48. analysis/src/data/components/dataset.py +321 -0
  49. analysis/src/data/protein_datamodule.py +242 -0
  50. analysis/src/eval.py +217 -0
README.md CHANGED
@@ -1,3 +1,134 @@
- ---
- license: gpl-3.0
- ---
# P2DFlow

> ## ℹ️ Version 2 of the P2DFlow code will be released soon to align with the revised paper and to make the code easier to use and modify (the current version runs correctly).

P2DFlow is a protein ensemble generative model with SE(3) flow matching based on ESMFold. The ensembles generated by P2DFlow can aid in understanding protein function across various scenarios.

Technical details and evaluation results are provided in our paper:
* [P2DFlow: A Protein Ensemble Generative Model with SE(3) Flow Matching](https://arxiv.org/abs/2411.17196)

<p align="center">
<img src="resources/workflow.jpg" width="600"/>
</p>

![P2DFlow](resources/gen_example.gif)


## Table of Contents
1. [Installation](#Installation)
2. [Prepare Dataset](#Prepare-Dataset)
3. [Model weights](#Model-weights)
4. [Training](#Training)
5. [Inference](#Inference)
6. [Evaluation](#Evaluation)
7. [License](#License)
8. [Citation](#Citation)


## Installation
In an environment with CUDA 11.7, run:
```
conda env create -f environment.yml
```
To activate the environment, run:
```
conda activate P2DFlow
```

## Prepare Dataset
#### (Tip: if you want to use the data we have preprocessed, go directly to `3. Process selected dataset`; if you prefer to process the data from scratch or to work with your own data, start from the beginning.)

#### 1. Download raw ATLAS dataset
(i) Download the `Analysis & MDs` dataset from [ATLAS](https://www.dsimb.inserm.fr/ATLAS/), or use `./dataset/download.py` by running:
```
python ./dataset/download.py
```
We will use the `.pdb` and `.xtc` files for the following calculations; they can be loaded as sketched below.
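As a quick sanity check, a downloaded topology/trajectory pair can be inspected with MDAnalysis, the package used by the analysis scripts in this commit; the file names below are placeholders only:
```
import MDAnalysis as mda

# Placeholder file names; substitute the .pdb/.xtc pair you downloaded from ATLAS.
u = mda.Universe("protein.pdb", "protein_R1.xtc")
protein = u.select_atoms("protein")
print(f"{len(protein.residues)} residues, {len(u.trajectory)} frames")
```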

#### 2. Calculate the 'approximate energy' and select representative structures
(i) Use `gaussian_kde` to calculate the 'approximate energy' (you need to put all of the files above in `./dataset`, including `ATLAS_filename.txt`, which lists the filenames of all proteins):
```
python ./dataset/traj_analyse.py
```
You will get `traj_info.csv`.

(ii) Select representative structures at equal intervals based on the 'approximate energy' (see the sketch after this step):
```
python ./dataset/md_select.py
```
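The authoritative implementations are `./dataset/traj_analyse.py` and `./dataset/md_select.py`; the snippet below is only a minimal illustration of the idea, assuming a single per-frame descriptor (the real scripts use their own featurization):
```
import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
feature = rng.normal(size=1000)          # stand-in for a per-frame descriptor

# 'Approximate energy' as the negative log of the KDE density estimate.
kde = gaussian_kde(feature)
energy = -np.log(kde(feature))

# Pick representative frames at (roughly) equal energy intervals.
n_select = 10
targets = np.linspace(energy.min(), energy.max(), n_select)
selected = [int(np.argmin(np.abs(energy - t))) for t in targets]
print(sorted(set(selected)))
```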

#### 3. Process selected dataset

(i) Download the selected dataset (or produce it with the two steps above) from [Google Drive](https://drive.google.com/drive/folders/11mdVfMi2rpVn7nNG2mQAGA5sNXCKePZj?usp=sharing); the filename is `selected_dataset.tar`. Decompress it with:
```
tar -xvf selected_dataset.tar
```
(ii) Preprocess the `.pdb` files to get `.pkl` files:
```
python ./data/process_pdb_files.py --pdb_dir ${pdb_dir} --write_dir ${write_dir}
```
You will get `metadata.csv`.

Then compute the node and pair representations using ESM-2 (`csv_path` is the path to `metadata.csv`):
```
python ./data/cal_repr.py --csv_path ${csv_path}
```
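The script `./data/cal_repr.py` is the authoritative implementation. As a rough sketch of what per-residue (node) embeddings from ESM-2 look like via the `fair-esm` package: the checkpoint chosen here is an arbitrary small one, and the pair representations used for training are produced by the script itself and are not reproduced in this sketch.
```
import torch
import esm

# Example only: a small ESM-2 checkpoint; cal_repr.py may use a different one.
model, alphabet = esm.pretrained.esm2_t12_35M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()

data = [("example", "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")]
_, _, tokens = batch_converter(data)
with torch.no_grad():
    out = model(tokens, repr_layers=[12])
node_repr = out["representations"][12][0, 1:-1]  # per-residue embeddings (BOS/EOS removed)
print(node_repr.shape)
```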
Then compute the predicted static structure using ESMFold (`csv_path` is the path to `metadata.csv`):
```
python ./data/cal_static_structure.py --csv_path ${csv_path}
```
(iii) Provide the necessary `.csv` files for training

If you are using the data we have preprocessed, download the `.csv` files from [Google Drive](https://drive.google.com/drive/folders/11mdVfMi2rpVn7nNG2mQAGA5sNXCKePZj?usp=sharing); their filenames are `train_dataset.csv` and `train_dataset_energy.csv` (they correspond to `csv_path` and `energy_csv_path` in `./configs/base.yaml` during training).

Or, if you are using your own data, use `metadata.csv` from step 3 (it corresponds to `csv_path` in `./configs/base.yaml` during training; you need to split a training subset from it, e.g. as sketched below) and `traj_info.csv` from step 2 (it corresponds to `energy_csv_path`).
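A minimal sketch of splitting a training subset out of `metadata.csv`; the split ratio and output filename are illustrative, and no particular column layout is assumed:
```
import pandas as pd

metadata = pd.read_csv("metadata.csv")
train = metadata.sample(frac=0.9, random_state=0)  # illustrative 90% training split
train.to_csv("train_dataset.csv", index=False)
```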



## Model weights
Download the pretrained checkpoint from [Google Drive](https://drive.google.com/drive/folders/11mdVfMi2rpVn7nNG2mQAGA5sNXCKePZj?usp=sharing); the filename is `pretrained.ckpt`. Put it into the `./weights` folder. You can use the pretrained weights for inference.


## Training
To train P2DFlow, first make sure you have prepared the dataset according to `Prepare Dataset` and placed it in the right folder, then modify `./configs/base.yaml` (especially `csv_path` and `energy_csv_path`). After this, run:
```
python experiments/train_se3_flows.py
```
The checkpoints will be written to `./ckpt`.


## Inference
To run inference for a specified protein sequence, first modify `./inference/valid_seq.csv` and `./configs/inference.yaml` (especially `validset_path`), then run:
```
python experiments/inference_se3_flows.py
```
The results will be written to `./inference_outputs/weights/`. (A sketch of how `valid_seq.csv` can be assembled follows below.)
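A minimal sketch of building `./inference/valid_seq.csv` with pandas. The `file` column is the name read by `analysis/merge_pred_pdb.py`; the other column name and the example values are purely illustrative:
```
import pandas as pd

valid_seq = pd.DataFrame({
    "file": ["example_protein"],                        # used to name merged outputs
    "seq": ["MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"],        # illustrative column name
})
valid_seq.to_csv("./inference/valid_seq.csv", index=False)
```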


## Evaluation
To evaluate metrics related to fidelity and dynamics, specify the paths in `./analysis/eval_test.py`, then run:
```
python ./analysis/eval_test.py
```
To run the PCA analysis, specify the paths in `./analysis/pca_analyse.py`, then run:
```
python ./analysis/pca_analyse.py
```
To draw the Ramachandran plots, specify the paths in `./analysis/Ramachandran_plot.py`, then run:
```
python ./analysis/Ramachandran_plot.py
```

## License
This project is licensed under the terms of the GPL-3.0 license.


## Citation
```
@article{jin2024p2dflow,
  title={P2DFlow: A Protein Ensemble Generative Model with SE(3) Flow Matching},
  author={Yaowei Jin and Qi Huang and Ziyang Song and Mingyue Zheng and Dan Teng and Qian Shi},
  journal={arXiv preprint arXiv:2411.17196},
  year={2024}
}
```
analysis/Ramachandran_plot.py ADDED
@@ -0,0 +1,99 @@
import MDAnalysis as mda
import numpy as np
from MDAnalysis.analysis.dihedrals import Ramachandran
import os
import re
import pandas as pd
import matplotlib.pyplot as plt


def ramachandran_eval(all_paths, pdb_file, output_dir):
    angle_results_all = []

    for dirpath in all_paths:
        pdb_path = os.path.join(dirpath, pdb_file)

        u = mda.Universe(pdb_path)
        protein = u.select_atoms('protein')
        # print('There are {} residues in the protein'.format(len(protein.residues)))

        ramachandran = Ramachandran(protein)
        ramachandran.run()
        angle_results = ramachandran.results.angles
        # print(angle_results.shape)

        # ramachandran.plot(color='black', marker='.')

        angle_results_all.append(angle_results.reshape([-1, 2]))

        # df = pd.DataFrame(angle_results.reshape([-1, 2]))
        # df.to_csv(os.path.join(output_dir, os.path.basename(dirpath)+'_'+pdb_file.split('.')[0]+'.csv'), index=False)

    points1 = angle_results_all[0]
    grid_size = 360  # size of the (phi, psi) grid
    x_bins = np.linspace(-180, 180, grid_size)
    y_bins = np.linspace(-180, 180, grid_size)
    result_tmp = {}
    for idx in range(len(angle_results_all[1:])):
        idx = idx + 1
        points2 = angle_results_all[idx]

        # Use 2D histograms to count how each set of points falls on the grid.
        hist1, _, _ = np.histogram2d(points1[:, 0], points1[:, 1], bins=[x_bins, y_bins])
        hist2, _, _ = np.histogram2d(points2[:, 0], points2[:, 1], bins=[x_bins, y_bins])

        # Convert the histograms to booleans marking whether any point falls into a cell.
        mask1 = hist1 > 0
        mask2 = hist2 > 0

        intersection = np.logical_and(mask1, mask2).sum()
        all_mask2 = mask2.sum()
        val_ratio = intersection / all_mask2
        print(os.path.basename(all_paths[idx]), "val_ratio:", val_ratio)

        result_tmp[os.path.basename(all_paths[idx])] = val_ratio
    result_tmp['file'] = pdb_file

    return result_tmp


if __name__ == "__main__":
    key1 = 'P2DFlow_epoch19'
    all_paths = [
        "/cluster/home/shiqian/frame-flow-test1/valid/evaluate/ATLAS_valid",
        # "/cluster/home/shiqian/frame-flow-test1/valid/evaluate/esm_n_pred",
        "/cluster/home/shiqian/frame-flow-test1/valid/evaluate/alphaflow_pred",
        "/cluster/home/shiqian/frame-flow-test1/valid/evaluate/Str2Str_pred",
        f'/cluster/home/shiqian/frame-flow-test1/valid/evaluate/{key1}',
    ]
    output_dir = '/cluster/home/shiqian/frame-flow-test1/valid/evaluate/Ramachandran'
    os.makedirs(output_dir, exist_ok=True)
    results = {
        'file': [],
        # 'esm_n_pred': [],
        'alphaflow_pred': [],
        'Str2Str_pred': [],
        key1: [],
    }
    for file in os.listdir(all_paths[0]):
        if re.search(r'\.pdb', file):
            pdb_file = file
            print(file)
            result_tmp = ramachandran_eval(
                all_paths=all_paths,
                pdb_file=pdb_file,
                output_dir=output_dir
            )
            for key in results.keys():
                results[key].append(result_tmp[key])

    out_total_df = pd.DataFrame(results)
    out_total_df.to_csv(os.path.join(output_dir, f'Ramachandran_plot_validity_{key1}.csv'), index=False)
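A toy illustration of the grid-overlap ratio computed in `ramachandran_eval` above: the reference (phi, psi) set occupies some cells of the grid, and `val_ratio` is the fraction of occupied prediction cells that are also occupied by the reference. The random angles below are synthetic, not real dihedrals.
```
import numpy as np

rng = np.random.default_rng(0)
reference = rng.uniform(-180, 180, size=(5000, 2))   # toy reference angles
prediction = rng.uniform(-180, 180, size=(500, 2))   # toy predicted angles

bins = np.linspace(-180, 180, 360)
hist_ref, _, _ = np.histogram2d(reference[:, 0], reference[:, 1], bins=[bins, bins])
hist_pred, _, _ = np.histogram2d(prediction[:, 0], prediction[:, 1], bins=[bins, bins])

mask_ref, mask_pred = hist_ref > 0, hist_pred > 0
val_ratio = np.logical_and(mask_ref, mask_pred).sum() / mask_pred.sum()
print(f"val_ratio = {val_ratio:.3f}")
```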
analysis/__pycache__/Ramachandran_plot.cpython-310.pyc ADDED
Binary file (2.25 kB). View file
 
analysis/__pycache__/merge_pred_pdb.cpython-310.pyc ADDED
Binary file (1.28 kB). View file
 
analysis/__pycache__/metrics.cpython-310.pyc ADDED
Binary file (2.07 kB). View file
 
analysis/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.2 kB). View file
 
analysis/__pycache__/utils.cpython-38.pyc ADDED
Binary file (2.18 kB). View file
 
analysis/eval_result.py ADDED
@@ -0,0 +1,66 @@
import os
import re
import sys
sys.path.append('./analysis')
import argparse

import pandas as pd
from src.eval import evaluate_prediction
from merge_pred_pdb import merge_pdb_full
from Ramachandran_plot import ramachandran_eval

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument("--pred_org_dir", type=str, default="./inference_outputs/weights/pretrained/2025-03-13_10-08")
    parser.add_argument("--valid_csv_file", type=str, default="./inference/valid_seq.csv")
    parser.add_argument("--pred_merge_dir", type=str, default="./inference/test/pred_merge_results")
    parser.add_argument("--target_dir", type=str, default="./inference/test/target_dir")
    parser.add_argument("--crystal_dir", type=str, default="./inference/test/crystal_dir")

    args = parser.parse_args()

    # merge pdb
    pred_org_dir = args.pred_org_dir
    valid_csv_file = args.valid_csv_file
    pred_merge_dir = args.pred_merge_dir
    merge_pdb_full(pred_org_dir, valid_csv_file, pred_merge_dir)

    # cal_eval
    pred_merge_dir = args.pred_merge_dir
    target_dir = args.target_dir
    crystal_dir = args.crystal_dir
    evaluate_prediction(pred_merge_dir, target_dir, crystal_dir)

    # cal_RP
    all_paths = [
        args.target_dir,
        args.pred_merge_dir,
    ]
    results = {}
    for file in os.listdir(all_paths[0]):
        if re.search(r'\.pdb', file):
            pdb_file = file
            print(file)
            result_tmp = ramachandran_eval(
                all_paths=all_paths,
                pdb_file=pdb_file,
                output_dir=args.pred_merge_dir,
            )

            for pred_paths in all_paths[1:]:
                key_name = os.path.basename(pred_paths)
                if key_name in results.keys():
                    results[key_name].append(result_tmp[key_name])
                else:
                    results[key_name] = [result_tmp[key_name]]

    out_total_df = pd.DataFrame(results)
    out_total_df.to_csv(os.path.join(args.pred_merge_dir, 'Ramachandran_plot_validity.csv'), index=False)
    print(f"RP results saved to {os.path.join(args.pred_merge_dir, 'Ramachandran_plot_validity.csv')}")
analysis/merge_pred_pdb.py ADDED
@@ -0,0 +1,45 @@
import os
import re
import pandas as pd
from Bio.PDB import PDBParser, PDBIO

def merge_pdb(work_dir, new_file, ref_pdb):
    parser = PDBParser()
    structures = []
    for pdb_dir in os.listdir(work_dir):
        pattern = ".*" + ref_pdb
        pdb_dir_full = os.path.join(work_dir, pdb_dir)
        if os.path.isdir(pdb_dir_full) and re.match(pattern, pdb_dir):
            for pdb_file in os.listdir(pdb_dir_full):
                if re.match(r"sample.*\.pdb", pdb_file):
                    structure = parser.get_structure(pdb_file, os.path.join(work_dir, pdb_dir, pdb_file))
                    structures.append(structure)

    if len(structures) == 0:
        return
    print(ref_pdb, len(structures), "files")

    new_structure = structures[0]
    count = 0
    for structure in structures[1:]:
        for model in structure:
            count += 1
            # print(dir(model))
            model.id = count
            new_structure.add(model)

    io = PDBIO()
    io.set_structure(new_structure)
    io.save(new_file)


def merge_pdb_full(inference_dir_f, valid_csv, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    valid_set = pd.read_csv(valid_csv)
    for filename in valid_set['file']:
        output_file = os.path.join(output_dir, filename + ".pdb")
        merge_pdb(inference_dir_f, output_file, filename)
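A minimal usage sketch for `merge_pdb_full`, assuming the default directory layout from `analysis/eval_result.py` and that `analysis/` is on `PYTHONPATH`:
```
from merge_pred_pdb import merge_pdb_full

# Collect all sample*.pdb files under each per-protein inference folder into
# one multi-model PDB per protein listed in valid_seq.csv.
merge_pdb_full(
    inference_dir_f="./inference_outputs/weights/pretrained/2025-03-13_10-08",
    valid_csv="./inference/valid_seq.csv",
    output_dir="./inference/test/pred_merge_results",
)
```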
analysis/metrics.py ADDED
@@ -0,0 +1,54 @@
""" Metrics. """
import mdtraj as md
import numpy as np
from openfold.np import residue_constants
from tmtools import tm_align
from data import utils as du

def calc_tm_score(pos_1, pos_2, seq_1, seq_2):
    tm_results = tm_align(pos_1, pos_2, seq_1, seq_2)
    return tm_results.tm_norm_chain1, tm_results.tm_norm_chain2

def calc_mdtraj_metrics(pdb_path):
    try:
        traj = md.load(pdb_path)
        pdb_ss = md.compute_dssp(traj, simplified=True)
        pdb_coil_percent = np.mean(pdb_ss == 'C')
        pdb_helix_percent = np.mean(pdb_ss == 'H')
        pdb_strand_percent = np.mean(pdb_ss == 'E')
        pdb_ss_percent = pdb_helix_percent + pdb_strand_percent
        pdb_rg = md.compute_rg(traj)[0]
    except IndexError as e:
        print('Error in calc_mdtraj_metrics: {}'.format(e))
        pdb_ss_percent = 0.0
        pdb_coil_percent = 0.0
        pdb_helix_percent = 0.0
        pdb_strand_percent = 0.0
        pdb_rg = 0.0
    return {
        'non_coil_percent': pdb_ss_percent,
        'coil_percent': pdb_coil_percent,
        'helix_percent': pdb_helix_percent,
        'strand_percent': pdb_strand_percent,
        'radius_of_gyration': pdb_rg,
    }

def calc_aligned_rmsd(pos_1, pos_2):
    aligned_pos_1 = du.rigid_transform_3D(pos_1, pos_2)[0]
    return np.mean(np.linalg.norm(aligned_pos_1 - pos_2, axis=-1))

def calc_ca_ca_metrics(ca_pos, bond_tol=0.1, clash_tol=1.0):
    ca_bond_dists = np.linalg.norm(
        ca_pos - np.roll(ca_pos, 1, axis=0), axis=-1)[1:]
    ca_ca_dev = np.mean(np.abs(ca_bond_dists - residue_constants.ca_ca))
    ca_ca_valid = np.mean(ca_bond_dists < (residue_constants.ca_ca + bond_tol))

    ca_ca_dists2d = np.linalg.norm(
        ca_pos[:, None, :] - ca_pos[None, :, :], axis=-1)
    inter_dists = ca_ca_dists2d[np.where(np.triu(ca_ca_dists2d, k=0) > 0)]
    clashes = inter_dists < clash_tol
    return {
        'ca_ca_deviation': ca_ca_dev,
        'ca_ca_valid_percent': ca_ca_valid,
        'num_ca_ca_clashes': np.sum(clashes),
    }
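A minimal usage sketch for the Cα geometry check above, assuming the repository root and `analysis/` are on `PYTHONPATH` so that this module's own imports (openfold, tmtools, `data.utils`) resolve:
```
import numpy as np
from metrics import calc_ca_ca_metrics

# Synthetic CA trace: 20 residues spaced 3.8 Å apart along x,
# close to the ideal CA-CA distance, so deviation should be ~0 and clashes 0.
ca_pos = np.stack([np.arange(20) * 3.8, np.zeros(20), np.zeros(20)], axis=-1)
print(calc_ca_ca_metrics(ca_pos))
```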
analysis/pca_analyse.py ADDED
@@ -0,0 +1,116 @@
import os
import re
import pandas as pd
import MDAnalysis as mda
from MDAnalysis.analysis import pca, align, rms
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import argparse
warnings.filterwarnings("ignore")


def cal_PCA(md_pdb_path, ref_path, pred_pdb_path, n_components=2):
    print("")
    print('filename=', os.path.basename(ref_path))

    u = mda.Universe(md_pdb_path, md_pdb_path)
    u_ref = mda.Universe(ref_path, ref_path)

    aligner = align.AlignTraj(u,
                              u_ref,
                              select='name CA or name C or name N',
                              in_memory=True).run()

    pc = pca.PCA(u,
                 select='name CA or name C or name N',
                 align=False, mean=None,
                 # n_components=None,
                 n_components=n_components,
                 ).run()

    backbone = u.select_atoms('name CA or name C or name N')
    n_bb = len(backbone)
    print('There are {} backbone atoms in the analysis'.format(n_bb))

    for i in range(n_components):
        print(f"Cumulated variance {i+1}: {pc.cumulated_variance[i]:.3f}")

    transformed = pc.transform(backbone, n_components=n_components)

    print(transformed.shape)  # (3000, 2)

    df = pd.DataFrame(transformed,
                      columns=['PC{}'.format(i+1) for i in range(n_components)])

    plt.scatter(df['PC1'], df['PC2'], marker='o')
    plt.show()

    output_dir = os.path.dirname(md_pdb_path)
    output_filename = os.path.basename(md_pdb_path).split('.')[0]

    df.to_csv(os.path.join(output_dir, f'{output_filename}_md_pca.csv'))
    plt.savefig(os.path.join(output_dir, f'{output_filename}_md_pca.png'))

    for k, v in pred_pdb_path.items():
        u_pred = mda.Universe(v, v)
        aligner = align.AlignTraj(u_pred,
                                  u_ref,
                                  select='name CA or name C or name N',
                                  in_memory=True).run()
        pred_backbone = u_pred.select_atoms('name CA or name C or name N')
        pred_transformed = pc.transform(pred_backbone, n_components=n_components)

        df = pd.DataFrame(pred_transformed,
                          columns=['PC{}'.format(i+1) for i in range(n_components)])

        plt.scatter(df['PC1'], df['PC2'], marker='o')
        plt.show()

        output_dir = os.path.dirname(v)
        output_filename = os.path.basename(v).split('.')[0]
        df.to_csv(os.path.join(output_dir, f'{output_filename}_{k}_pca.csv'))
        plt.savefig(os.path.join(output_dir, f'{output_filename}_{k}_pca.png'))
        plt.clf()


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument("--pred_pdb_dir", type=str, default="./inference/test/pred_merge_results")
    parser.add_argument("--target_dir", type=str, default="./inference/test/target_dir")
    parser.add_argument("--crystal_dir", type=str, default="./inference/test/crystal_dir")

    args = parser.parse_args()

    pred_pdb_path_org = {
        'P2DFlow': args.pred_pdb_dir,
    }
    md_pdb_path_org = args.target_dir
    ref_path_org = args.crystal_dir

    for file in os.listdir(md_pdb_path_org):
        if re.search(r'\.pdb', file):
            pred_pdb_path = {
                'P2DFlow': '',
                # 'alphaflow': '',
                # 'Str2Str': '',
            }
            for k, v in pred_pdb_path.items():
                pred_pdb_path[k] = os.path.join(pred_pdb_path_org[k], file)
            md_pdb_path = os.path.join(md_pdb_path_org, file)
            ref_path = os.path.join(ref_path_org, file)
            cal_PCA(md_pdb_path, ref_path, pred_pdb_path)
analysis/src/__init__.py ADDED
File without changes
analysis/src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (138 Bytes). View file
 
analysis/src/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (132 Bytes). View file
 
analysis/src/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (136 Bytes). View file
 
analysis/src/__pycache__/eval.cpython-310.pyc ADDED
Binary file (3.09 kB). View file
 
analysis/src/__pycache__/eval.cpython-37.pyc ADDED
Binary file (4.61 kB). View file
 
analysis/src/__pycache__/eval.cpython-39.pyc ADDED
Binary file (4.95 kB). View file
 
analysis/src/common/__init__.py ADDED
File without changes
analysis/src/common/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (164 Bytes). View file
 
analysis/src/common/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (143 Bytes). View file
 
analysis/src/common/__pycache__/all_atom.cpython-39.pyc ADDED
Binary file (5.15 kB). View file
 
analysis/src/common/__pycache__/data_transforms.cpython-39.pyc ADDED
Binary file (26.9 kB). View file
 
analysis/src/common/__pycache__/geo_utils.cpython-310.pyc ADDED
Binary file (5.02 kB). View file
 
analysis/src/common/__pycache__/geo_utils.cpython-39.pyc ADDED
Binary file (5 kB). View file
 
analysis/src/common/__pycache__/pdb_utils.cpython-310.pyc ADDED
Binary file (10.5 kB). View file
 
analysis/src/common/__pycache__/pdb_utils.cpython-39.pyc ADDED
Binary file (10.4 kB). View file
 
analysis/src/common/__pycache__/protein.cpython-310.pyc ADDED
Binary file (7.46 kB). View file
 
analysis/src/common/__pycache__/protein.cpython-39.pyc ADDED
Binary file (7.45 kB). View file
 
analysis/src/common/__pycache__/residue_constants.cpython-310.pyc ADDED
Binary file (23.7 kB). View file
 
analysis/src/common/__pycache__/residue_constants.cpython-39.pyc ADDED
Binary file (23.2 kB). View file
 
analysis/src/common/__pycache__/rigid_utils.cpython-39.pyc ADDED
Binary file (41.4 kB). View file
 
analysis/src/common/__pycache__/rotation3d.cpython-39.pyc ADDED
Binary file (17 kB). View file
 
analysis/src/common/all_atom.py ADDED
@@ -0,0 +1,219 @@
"""
Utilities for calculating all atom representations.
"""

import torch

from src.common import residue_constants as rc
from src.common.data_transforms import atom37_to_torsion_angles
from src.common.rigid_utils import Rigid, Rotation


# Residue Constants from OpenFold/AlphaFold2.
IDEALIZED_POS37 = torch.tensor(rc.restype_atom37_rigid_group_positions)
IDEALIZED_POS37_MASK = torch.any(IDEALIZED_POS37, axis=-1)
IDEALIZED_POS = torch.tensor(rc.restype_atom14_rigid_group_positions)
DEFAULT_FRAMES = torch.tensor(rc.restype_rigid_group_default_frame)
ATOM_MASK = torch.tensor(rc.restype_atom14_mask)
GROUP_IDX = torch.tensor(rc.restype_atom14_to_rigid_group)


def torsion_angles_to_frames(
    r: Rigid,
    alpha: torch.Tensor,
    aatype: torch.Tensor,
):
    # [*, N, 8, 4, 4]
    default_4x4 = DEFAULT_FRAMES[aatype, ...].to(r.device)

    # [*, N, 8] transformations, i.e.
    #   One [*, N, 8, 3, 3] rotation matrix and
    #   One [*, N, 8, 3]    translation matrix
    default_r = r.from_tensor_4x4(default_4x4)

    bb_rot = alpha.new_zeros((*((1,) * len(alpha.shape[:-1])), 2))
    bb_rot[..., 1] = 1

    # [*, N, 8, 2]
    alpha = torch.cat(
        [bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], dim=-2
    )

    # [*, N, 8, 3, 3]
    # Produces rotation matrices of the form:
    # [
    #   [1, 0  , 0  ],
    #   [0, a_2,-a_1],
    #   [0, a_1, a_2]
    # ]
    # This follows the original code rather than the supplement, which uses
    # different indices.

    all_rots = alpha.new_zeros(default_r.get_rots().get_rot_mats().shape)
    all_rots[..., 0, 0] = 1
    all_rots[..., 1, 1] = alpha[..., 1]
    all_rots[..., 1, 2] = -alpha[..., 0]
    all_rots[..., 2, 1:] = alpha

    all_rots = Rigid(Rotation(rot_mats=all_rots), None)

    all_frames = default_r.compose(all_rots)

    chi2_frame_to_frame = all_frames[..., 5]
    chi3_frame_to_frame = all_frames[..., 6]
    chi4_frame_to_frame = all_frames[..., 7]

    chi1_frame_to_bb = all_frames[..., 4]
    chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame)
    chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame)
    chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame)

    all_frames_to_bb = Rigid.cat(
        [
            all_frames[..., :5],
            chi2_frame_to_bb.unsqueeze(-1),
            chi3_frame_to_bb.unsqueeze(-1),
            chi4_frame_to_bb.unsqueeze(-1),
        ],
        dim=-1,
    )

    all_frames_to_global = r[..., None].compose(all_frames_to_bb)

    return all_frames_to_global


def prot_to_torsion_angles(aatype, atom37, atom37_mask):
    """Calculate torsion angle features from protein features."""
    prot_feats = {
        'aatype': aatype,
        'all_atom_positions': atom37,
        'all_atom_mask': atom37_mask,
    }
    torsion_angles_feats = atom37_to_torsion_angles()(prot_feats)
    torsion_angles = torsion_angles_feats['torsion_angles_sin_cos']
    torsion_mask = torsion_angles_feats['torsion_angles_mask']
    return torsion_angles, torsion_mask


def frames_to_atom14_pos(
    r: Rigid,
    aatype: torch.Tensor,
):
    """Convert frames to their idealized all atom representation.

    Args:
        r: All rigid groups. [..., N, 8, 3]
        aatype: Residue types. [..., N]

    Returns:

    """

    # [*, N, 14]
    group_mask = GROUP_IDX[aatype, ...]

    # [*, N, 14, 8]
    group_mask = torch.nn.functional.one_hot(
        group_mask,
        num_classes=DEFAULT_FRAMES.shape[-3],
    ).to(r.device)

    # [*, N, 14, 8]
    t_atoms_to_global = r[..., None, :] * group_mask

    # [*, N, 14]
    t_atoms_to_global = t_atoms_to_global.map_tensor_fn(
        lambda x: torch.sum(x, dim=-1)
    )

    # [*, N, 14, 1]
    frame_atom_mask = ATOM_MASK[aatype, ...].unsqueeze(-1).to(r.device)

    # [*, N, 14, 3]
    frame_null_pos = IDEALIZED_POS[aatype, ...].to(r.device)
    pred_positions = t_atoms_to_global.apply(frame_null_pos)
    pred_positions = pred_positions * frame_atom_mask

    return pred_positions


def compute_backbone(bb_rigids, psi_torsions, aatype=None, device=None):
    if device is None:
        device = bb_rigids.device

    torsion_angles = torch.tile(
        psi_torsions[..., None, :],
        tuple([1 for _ in range(len(bb_rigids.shape))]) + (7, 1)
    ).to(device)

    # aatype must be on cpu for initializing the tensor by indexing
    if aatype is None:
        aatype = torch.zeros_like(bb_rigids).cpu().long()
    else:
        aatype = aatype.cpu()

    all_frames = torsion_angles_to_frames(
        bb_rigids,
        torsion_angles,
        aatype,
    )
    atom14_pos = frames_to_atom14_pos(
        all_frames,
        aatype,
    )
    atom37_bb_pos = torch.zeros(bb_rigids.shape + (37, 3), device=device)
    # atom14 bb order = ['N', 'CA', 'C', 'O', 'CB']
    # atom37 bb order = ['N', 'CA', 'C', 'CB', 'O']
    atom37_bb_pos[..., :3, :] = atom14_pos[..., :3, :]
    atom37_bb_pos[..., 3, :] = atom14_pos[..., 4, :]
    atom37_bb_pos[..., 4, :] = atom14_pos[..., 3, :]
    atom37_mask = torch.any(atom37_bb_pos, axis=-1)
    return atom37_bb_pos, atom37_mask, aatype.to(device), atom14_pos


def calculate_neighbor_angles(R_ac, R_ab):
    """Calculate angles between atoms c <- a -> b.

    Parameters
    ----------
    R_ac: Tensor, shape = (N,3)
        Vector from atom a to c.
    R_ab: Tensor, shape = (N,3)
        Vector from atom a to b.

    Returns
    -------
    angle_cab: Tensor, shape = (N,)
        Angle between atoms c <- a -> b.
    """
    # cos(alpha) = (u * v) / (|u|*|v|)
    x = torch.sum(R_ac * R_ab, dim=1)  # shape = (N,)
    # sin(alpha) = |u x v| / (|u|*|v|)
    y = torch.cross(R_ac, R_ab).norm(dim=-1)  # shape = (N,)
    # avoid that for y == (0,0,0) the gradient wrt. y becomes NaN
    y = torch.max(y, torch.tensor(1e-9))
    angle = torch.atan2(y, x)
    return angle


def vector_projection(R_ab, P_n):
    """
    Project the vector R_ab onto a plane with normal vector P_n.

    Parameters
    ----------
    R_ab: Tensor, shape = (N,3)
        Vector from atom a to b.
    P_n: Tensor, shape = (N,3)
        Normal vector of a plane onto which to project R_ab.

    Returns
    -------
    R_ab_proj: Tensor, shape = (N,3)
        Projected vector (orthogonal to P_n).
    """
    a_x_b = torch.sum(R_ab * P_n, dim=-1)
    b_x_b = torch.sum(P_n * P_n, dim=-1)
    return R_ab - (a_x_b / b_x_b)[:, None] * P_n
analysis/src/common/data_transforms.py ADDED
@@ -0,0 +1,1194 @@
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import itertools
17
+ from functools import reduce, wraps
18
+ from operator import add
19
+
20
+ import numpy as np
21
+ import torch
22
+
23
+ from src.common import residue_constants as rc
24
+ from src.common.rigid_utils import Rotation, Rigid
25
+ from src.utils.tensor_utils import (
26
+ tree_map,
27
+ tensor_tree_map,
28
+ batched_gather,
29
+ )
30
+
31
+ NUM_RES = "num residues placeholder"
32
+ NUM_MSA_SEQ = "msa placeholder"
33
+ NUM_EXTRA_SEQ = "extra msa placeholder"
34
+ NUM_TEMPLATES = "num templates placeholder"
35
+
36
+ MSA_FEATURE_NAMES = [
37
+ "msa",
38
+ "deletion_matrix",
39
+ "msa_mask",
40
+ "msa_row_mask",
41
+ "bert_mask",
42
+ "true_msa",
43
+ ]
44
+
45
+
46
+ def cast_to_64bit_ints(protein):
47
+ # We keep all ints as int64
48
+ for k, v in protein.items():
49
+ if v.dtype == torch.int32:
50
+ protein[k] = v.type(torch.int64)
51
+
52
+ return protein
53
+
54
+
55
+ def make_one_hot(x, num_classes):
56
+ x_one_hot = torch.zeros(*x.shape, num_classes)
57
+ x_one_hot.scatter_(-1, x.unsqueeze(-1), 1)
58
+ return x_one_hot
59
+
60
+
61
+ def make_seq_mask(protein):
62
+ protein["seq_mask"] = torch.ones(
63
+ protein["aatype"].shape, dtype=torch.float32
64
+ )
65
+ return protein
66
+
67
+
68
+ def make_template_mask(protein):
69
+ protein["template_mask"] = torch.ones(
70
+ protein["template_aatype"].shape[0], dtype=torch.float32
71
+ )
72
+ return protein
73
+
74
+
75
+ def curry1(f):
76
+ """Supply all arguments but the first."""
77
+ @wraps(f)
78
+ def fc(*args, **kwargs):
79
+ return lambda x: f(x, *args, **kwargs)
80
+
81
+ return fc
82
+
83
+
84
+ def make_all_atom_aatype(protein):
85
+ protein["all_atom_aatype"] = protein["aatype"]
86
+ return protein
87
+
88
+
89
+ def fix_templates_aatype(protein):
90
+ # Map one-hot to indices
91
+ num_templates = protein["template_aatype"].shape[0]
92
+ if(num_templates > 0):
93
+ protein["template_aatype"] = torch.argmax(
94
+ protein["template_aatype"], dim=-1
95
+ )
96
+ # Map hhsearch-aatype to our aatype.
97
+ new_order_list = rc.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE
98
+ new_order = torch.tensor(new_order_list, dtype=torch.int64).expand(
99
+ num_templates, -1
100
+ )
101
+ protein["template_aatype"] = torch.gather(
102
+ new_order, 1, index=protein["template_aatype"]
103
+ )
104
+
105
+ return protein
106
+
107
+
108
+ def correct_msa_restypes(protein):
109
+ """Correct MSA restype to have the same order as rc."""
110
+ new_order_list = rc.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE
111
+ new_order = torch.tensor(
112
+ [new_order_list] * protein["msa"].shape[1], dtype=protein["msa"].dtype
113
+ ).transpose(0, 1)
114
+ protein["msa"] = torch.gather(new_order, 0, protein["msa"])
115
+
116
+ perm_matrix = np.zeros((22, 22), dtype=np.float32)
117
+ perm_matrix[range(len(new_order_list)), new_order_list] = 1.0
118
+
119
+ for k in protein:
120
+ if "profile" in k:
121
+ num_dim = protein[k].shape.as_list()[-1]
122
+ assert num_dim in [
123
+ 20,
124
+ 21,
125
+ 22,
126
+ ], "num_dim for %s out of expected range: %s" % (k, num_dim)
127
+ protein[k] = torch.dot(protein[k], perm_matrix[:num_dim, :num_dim])
128
+
129
+ return protein
130
+
131
+
132
+ def squeeze_features(protein):
133
+ """Remove singleton and repeated dimensions in protein features."""
134
+ protein["aatype"] = torch.argmax(protein["aatype"], dim=-1)
135
+ for k in [
136
+ "domain_name",
137
+ "msa",
138
+ "num_alignments",
139
+ "seq_length",
140
+ "sequence",
141
+ "superfamily",
142
+ "deletion_matrix",
143
+ "resolution",
144
+ "between_segment_residues",
145
+ "residue_index",
146
+ "template_all_atom_mask",
147
+ ]:
148
+ if k in protein:
149
+ final_dim = protein[k].shape[-1]
150
+ if isinstance(final_dim, int) and final_dim == 1:
151
+ if torch.is_tensor(protein[k]):
152
+ protein[k] = torch.squeeze(protein[k], dim=-1)
153
+ else:
154
+ protein[k] = np.squeeze(protein[k], axis=-1)
155
+
156
+ for k in ["seq_length", "num_alignments"]:
157
+ if k in protein:
158
+ protein[k] = protein[k][0]
159
+
160
+ return protein
161
+
162
+
163
+ @curry1
164
+ def randomly_replace_msa_with_unknown(protein, replace_proportion):
165
+ """Replace a portion of the MSA with 'X'."""
166
+ msa_mask = torch.rand(protein["msa"].shape) < replace_proportion
167
+ x_idx = 20
168
+ gap_idx = 21
169
+ msa_mask = torch.logical_and(msa_mask, protein["msa"] != gap_idx)
170
+ protein["msa"] = torch.where(
171
+ msa_mask,
172
+ torch.ones_like(protein["msa"]) * x_idx,
173
+ protein["msa"]
174
+ )
175
+ aatype_mask = torch.rand(protein["aatype"].shape) < replace_proportion
176
+
177
+ protein["aatype"] = torch.where(
178
+ aatype_mask,
179
+ torch.ones_like(protein["aatype"]) * x_idx,
180
+ protein["aatype"],
181
+ )
182
+ return protein
183
+
184
+
185
+ @curry1
186
+ def sample_msa(protein, max_seq, keep_extra, seed=None):
187
+ """Sample MSA randomly, remaining sequences are stored are stored as `extra_*`."""
188
+ num_seq = protein["msa"].shape[0]
189
+ g = torch.Generator(device=protein["msa"].device)
190
+ if seed is not None:
191
+ g.manual_seed(seed)
192
+ shuffled = torch.randperm(num_seq - 1, generator=g) + 1
193
+ index_order = torch.cat((torch.tensor([0]), shuffled), dim=0)
194
+ num_sel = min(max_seq, num_seq)
195
+ sel_seq, not_sel_seq = torch.split(
196
+ index_order, [num_sel, num_seq - num_sel]
197
+ )
198
+
199
+ for k in MSA_FEATURE_NAMES:
200
+ if k in protein:
201
+ if keep_extra:
202
+ protein["extra_" + k] = torch.index_select(
203
+ protein[k], 0, not_sel_seq
204
+ )
205
+ protein[k] = torch.index_select(protein[k], 0, sel_seq)
206
+
207
+ return protein
208
+
209
+
210
+ @curry1
211
+ def add_distillation_flag(protein, distillation):
212
+ protein['is_distillation'] = distillation
213
+ return protein
214
+
215
+ @curry1
216
+ def sample_msa_distillation(protein, max_seq):
217
+ if(protein["is_distillation"] == 1):
218
+ protein = sample_msa(max_seq, keep_extra=False)(protein)
219
+ return protein
220
+
221
+
222
+ @curry1
223
+ def crop_extra_msa(protein, max_extra_msa):
224
+ num_seq = protein["extra_msa"].shape[0]
225
+ num_sel = min(max_extra_msa, num_seq)
226
+ select_indices = torch.randperm(num_seq)[:num_sel]
227
+ for k in MSA_FEATURE_NAMES:
228
+ if "extra_" + k in protein:
229
+ protein["extra_" + k] = torch.index_select(
230
+ protein["extra_" + k], 0, select_indices
231
+ )
232
+
233
+ return protein
234
+
235
+
236
+ def delete_extra_msa(protein):
237
+ for k in MSA_FEATURE_NAMES:
238
+ if "extra_" + k in protein:
239
+ del protein["extra_" + k]
240
+ return protein
241
+
242
+
243
+ # Not used in inference
244
+ @curry1
245
+ def block_delete_msa(protein, config):
246
+ num_seq = protein["msa"].shape[0]
247
+ block_num_seq = torch.floor(
248
+ torch.tensor(num_seq, dtype=torch.float32)
249
+ * config.msa_fraction_per_block
250
+ ).to(torch.int32)
251
+
252
+ if config.randomize_num_blocks:
253
+ nb = torch.distributions.uniform.Uniform(
254
+ 0, config.num_blocks + 1
255
+ ).sample()
256
+ else:
257
+ nb = config.num_blocks
258
+
259
+ del_block_starts = torch.distributions.Uniform(0, num_seq).sample(nb)
260
+ del_blocks = del_block_starts[:, None] + torch.range(block_num_seq)
261
+ del_blocks = torch.clip(del_blocks, 0, num_seq - 1)
262
+ del_indices = torch.unique(torch.sort(torch.reshape(del_blocks, [-1])))[0]
263
+
264
+ # Make sure we keep the original sequence
265
+ combined = torch.cat((torch.range(1, num_seq)[None], del_indices[None]))
266
+ uniques, counts = combined.unique(return_counts=True)
267
+ difference = uniques[counts == 1]
268
+ intersection = uniques[counts > 1]
269
+ keep_indices = torch.squeeze(difference, 0)
270
+
271
+ for k in MSA_FEATURE_NAMES:
272
+ if k in protein:
273
+ protein[k] = torch.gather(protein[k], keep_indices)
274
+
275
+ return protein
276
+
277
+
278
+ @curry1
279
+ def nearest_neighbor_clusters(protein, gap_agreement_weight=0.0):
280
+ weights = torch.cat(
281
+ [torch.ones(21), gap_agreement_weight * torch.ones(1), torch.zeros(1)],
282
+ 0,
283
+ )
284
+
285
+ # Make agreement score as weighted Hamming distance
286
+ msa_one_hot = make_one_hot(protein["msa"], 23)
287
+ sample_one_hot = protein["msa_mask"][:, :, None] * msa_one_hot
288
+ extra_msa_one_hot = make_one_hot(protein["extra_msa"], 23)
289
+ extra_one_hot = protein["extra_msa_mask"][:, :, None] * extra_msa_one_hot
290
+
291
+ num_seq, num_res, _ = sample_one_hot.shape
292
+ extra_num_seq, _, _ = extra_one_hot.shape
293
+
294
+ # Compute tf.einsum('mrc,nrc,c->mn', sample_one_hot, extra_one_hot, weights)
295
+ # in an optimized fashion to avoid possible memory or computation blowup.
296
+ agreement = torch.matmul(
297
+ torch.reshape(extra_one_hot, [extra_num_seq, num_res * 23]),
298
+ torch.reshape(
299
+ sample_one_hot * weights, [num_seq, num_res * 23]
300
+ ).transpose(0, 1),
301
+ )
302
+
303
+ # Assign each sequence in the extra sequences to the closest MSA sample
304
+ protein["extra_cluster_assignment"] = torch.argmax(agreement, dim=1).to(
305
+ torch.int64
306
+ )
307
+
308
+ return protein
309
+
310
+
311
+ def unsorted_segment_sum(data, segment_ids, num_segments):
312
+ """
313
+ Computes the sum along segments of a tensor. Similar to
314
+ tf.unsorted_segment_sum, but only supports 1-D indices.
315
+
316
+ :param data: A tensor whose segments are to be summed.
317
+ :param segment_ids: The 1-D segment indices tensor.
318
+ :param num_segments: The number of segments.
319
+ :return: A tensor of same data type as the data argument.
320
+ """
321
+ assert (
322
+ len(segment_ids.shape) == 1 and
323
+ segment_ids.shape[0] == data.shape[0]
324
+ )
325
+ segment_ids = segment_ids.view(
326
+ segment_ids.shape[0], *((1,) * len(data.shape[1:]))
327
+ )
328
+ segment_ids = segment_ids.expand(data.shape)
329
+ shape = [num_segments] + list(data.shape[1:])
330
+ tensor = torch.zeros(*shape).scatter_add_(0, segment_ids, data.float())
331
+ tensor = tensor.type(data.dtype)
332
+ return tensor
333
+
334
+
335
+ @curry1
336
+ def summarize_clusters(protein):
337
+ """Produce profile and deletion_matrix_mean within each cluster."""
338
+ num_seq = protein["msa"].shape[0]
339
+
340
+ def csum(x):
341
+ return unsorted_segment_sum(
342
+ x, protein["extra_cluster_assignment"], num_seq
343
+ )
344
+
345
+ mask = protein["extra_msa_mask"]
346
+ mask_counts = 1e-6 + protein["msa_mask"] + csum(mask) # Include center
347
+
348
+ msa_sum = csum(mask[:, :, None] * make_one_hot(protein["extra_msa"], 23))
349
+ msa_sum += make_one_hot(protein["msa"], 23) # Original sequence
350
+ protein["cluster_profile"] = msa_sum / mask_counts[:, :, None]
351
+ del msa_sum
352
+
353
+ del_sum = csum(mask * protein["extra_deletion_matrix"])
354
+ del_sum += protein["deletion_matrix"] # Original sequence
355
+ protein["cluster_deletion_mean"] = del_sum / mask_counts
356
+ del del_sum
357
+
358
+ return protein
359
+
360
+
361
+ def make_msa_mask(protein):
362
+ """Mask features are all ones, but will later be zero-padded."""
363
+ protein["msa_mask"] = torch.ones(protein["msa"].shape, dtype=torch.float32)
364
+ protein["msa_row_mask"] = torch.ones(
365
+ (protein["msa"].shape[0]), dtype=torch.float32
366
+ )
367
+ return protein
368
+
369
+
370
+ def pseudo_beta_fn(aatype, all_atom_positions, all_atom_mask):
371
+ """Create pseudo beta features."""
372
+ is_gly = torch.eq(aatype, rc.restype_order["G"])
373
+ ca_idx = rc.atom_order["CA"]
374
+ cb_idx = rc.atom_order["CB"]
375
+ pseudo_beta = torch.where(
376
+ torch.tile(is_gly[..., None], [1] * len(is_gly.shape) + [3]),
377
+ all_atom_positions[..., ca_idx, :],
378
+ all_atom_positions[..., cb_idx, :],
379
+ )
380
+
381
+ if all_atom_mask is not None:
382
+ pseudo_beta_mask = torch.where(
383
+ is_gly, all_atom_mask[..., ca_idx], all_atom_mask[..., cb_idx]
384
+ )
385
+ return pseudo_beta, pseudo_beta_mask
386
+ else:
387
+ return pseudo_beta
388
+
389
+
390
+ @curry1
391
+ def make_pseudo_beta(protein, prefix=""):
392
+ """Create pseudo-beta (alpha for glycine) position and mask."""
393
+ assert prefix in ["", "template_"]
394
+ (
395
+ protein[prefix + "pseudo_beta"],
396
+ protein[prefix + "pseudo_beta_mask"],
397
+ ) = pseudo_beta_fn(
398
+ protein["template_aatype" if prefix else "aatype"],
399
+ protein[prefix + "all_atom_positions"],
400
+ protein["template_all_atom_mask" if prefix else "all_atom_mask"],
401
+ )
402
+ return protein
403
+
404
+
405
+ @curry1
406
+ def add_constant_field(protein, key, value):
407
+ protein[key] = torch.tensor(value)
408
+ return protein
409
+
410
+
411
+ def shaped_categorical(probs, epsilon=1e-10):
412
+ ds = probs.shape
413
+ num_classes = ds[-1]
414
+ distribution = torch.distributions.categorical.Categorical(
415
+ torch.reshape(probs + epsilon, [-1, num_classes])
416
+ )
417
+ counts = distribution.sample()
418
+ return torch.reshape(counts, ds[:-1])
419
+
420
+
421
+ def make_hhblits_profile(protein):
422
+ """Compute the HHblits MSA profile if not already present."""
423
+ if "hhblits_profile" in protein:
424
+ return protein
425
+
426
+ # Compute the profile for every residue (over all MSA sequences).
427
+ msa_one_hot = make_one_hot(protein["msa"], 22)
428
+
429
+ protein["hhblits_profile"] = torch.mean(msa_one_hot, dim=0)
430
+ return protein
431
+
432
+
433
+ @curry1
434
+ def make_masked_msa(protein, config, replace_fraction):
435
+ """Create data for BERT on raw MSA."""
436
+ # Add a random amino acid uniformly.
437
+ random_aa = torch.tensor([0.05] * 20 + [0.0, 0.0], dtype=torch.float32)
438
+
439
+ categorical_probs = (
440
+ config.uniform_prob * random_aa
441
+ + config.profile_prob * protein["hhblits_profile"]
442
+ + config.same_prob * make_one_hot(protein["msa"], 22)
443
+ )
444
+
445
+ # Put all remaining probability on [MASK] which is a new column
446
+ pad_shapes = list(
447
+ reduce(add, [(0, 0) for _ in range(len(categorical_probs.shape))])
448
+ )
449
+ pad_shapes[1] = 1
450
+ mask_prob = (
451
+ 1.0 - config.profile_prob - config.same_prob - config.uniform_prob
452
+ )
453
+ assert mask_prob >= 0.0
454
+ categorical_probs = torch.nn.functional.pad(
455
+ categorical_probs, pad_shapes, value=mask_prob
456
+ )
457
+
458
+ sh = protein["msa"].shape
459
+ mask_position = torch.rand(sh) < replace_fraction
460
+
461
+ bert_msa = shaped_categorical(categorical_probs)
462
+ bert_msa = torch.where(mask_position, bert_msa, protein["msa"])
463
+
464
+ # Mix real and masked MSA
465
+ protein["bert_mask"] = mask_position.to(torch.float32)
466
+ protein["true_msa"] = protein["msa"]
467
+ protein["msa"] = bert_msa
468
+
469
+ return protein
470
+
471
+
472
+ @curry1
473
+ def make_fixed_size(
474
+ protein,
475
+ shape_schema,
476
+ msa_cluster_size,
477
+ extra_msa_size,
478
+ num_res=0,
479
+ num_templates=0,
480
+ ):
481
+ """Guess at the MSA and sequence dimension to make fixed size."""
482
+ pad_size_map = {
483
+ NUM_RES: num_res,
484
+ NUM_MSA_SEQ: msa_cluster_size,
485
+ NUM_EXTRA_SEQ: extra_msa_size,
486
+ NUM_TEMPLATES: num_templates,
487
+ }
488
+
489
+ for k, v in protein.items():
490
+ # Don't transfer this to the accelerator.
491
+ if k == "extra_cluster_assignment":
492
+ continue
493
+ shape = list(v.shape)
494
+ schema = shape_schema[k]
495
+ msg = "Rank mismatch between shape and shape schema for"
496
+ assert len(shape) == len(schema), f"{msg} {k}: {shape} vs {schema}"
497
+ pad_size = [
498
+ pad_size_map.get(s2, None) or s1 for (s1, s2) in zip(shape, schema)
499
+ ]
500
+
501
+ padding = [(0, p - v.shape[i]) for i, p in enumerate(pad_size)]
502
+ padding.reverse()
503
+ padding = list(itertools.chain(*padding))
504
+ if padding:
505
+ protein[k] = torch.nn.functional.pad(v, padding)
506
+ protein[k] = torch.reshape(protein[k], pad_size)
507
+
508
+ return protein
509
+
510
+
511
+ @curry1
512
+ def make_msa_feat(protein):
513
+ """Create and concatenate MSA features."""
514
+ # Whether there is a domain break. Always zero for chains, but keeping for
515
+ # compatibility with domain datasets.
516
+ has_break = torch.clip(
517
+ protein["between_segment_residues"].to(torch.float32), 0, 1
518
+ )
519
+ aatype_1hot = make_one_hot(protein["aatype"], 21)
520
+
521
+ target_feat = [
522
+ torch.unsqueeze(has_break, dim=-1),
523
+ aatype_1hot, # Everyone gets the original sequence.
524
+ ]
525
+
526
+ msa_1hot = make_one_hot(protein["msa"], 23)
527
+ has_deletion = torch.clip(protein["deletion_matrix"], 0.0, 1.0)
528
+ deletion_value = torch.atan(protein["deletion_matrix"] / 3.0) * (
529
+ 2.0 / np.pi
530
+ )
531
+
532
+ msa_feat = [
533
+ msa_1hot,
534
+ torch.unsqueeze(has_deletion, dim=-1),
535
+ torch.unsqueeze(deletion_value, dim=-1),
536
+ ]
537
+
538
+ if "cluster_profile" in protein:
539
+ deletion_mean_value = torch.atan(
540
+ protein["cluster_deletion_mean"] / 3.0
541
+ ) * (2.0 / np.pi)
542
+ msa_feat.extend(
543
+ [
544
+ protein["cluster_profile"],
545
+ torch.unsqueeze(deletion_mean_value, dim=-1),
546
+ ]
547
+ )
548
+
549
+ if "extra_deletion_matrix" in protein:
550
+ protein["extra_has_deletion"] = torch.clip(
551
+ protein["extra_deletion_matrix"], 0.0, 1.0
552
+ )
553
+ protein["extra_deletion_value"] = torch.atan(
554
+ protein["extra_deletion_matrix"] / 3.0
555
+ ) * (2.0 / np.pi)
556
+
557
+ protein["msa_feat"] = torch.cat(msa_feat, dim=-1)
558
+ protein["target_feat"] = torch.cat(target_feat, dim=-1)
559
+ return protein
560
+
561
+
562
+ @curry1
563
+ def select_feat(protein, feature_list):
564
+ return {k: v for k, v in protein.items() if k in feature_list}
565
+
566
+
567
+ @curry1
568
+ def crop_templates(protein, max_templates):
569
+ for k, v in protein.items():
570
+ if k.startswith("template_"):
571
+ protein[k] = v[:max_templates]
572
+ return protein
573
+
574
+
575
+ def make_atom14_masks(protein):
576
+ """Construct denser atom positions (14 dimensions instead of 37)."""
577
+ restype_atom14_to_atom37 = []
578
+ restype_atom37_to_atom14 = []
579
+ restype_atom14_mask = []
580
+
581
+ for rt in rc.restypes:
582
+ atom_names = rc.restype_name_to_atom14_names[rc.restype_1to3[rt]]
583
+ restype_atom14_to_atom37.append(
584
+ [(rc.atom_order[name] if name else 0) for name in atom_names]
585
+ )
586
+ atom_name_to_idx14 = {name: i for i, name in enumerate(atom_names)}
587
+ restype_atom37_to_atom14.append(
588
+ [
589
+ (atom_name_to_idx14[name] if name in atom_name_to_idx14 else 0)
590
+ for name in rc.atom_types
591
+ ]
592
+ )
593
+
594
+ restype_atom14_mask.append(
595
+ [(1.0 if name else 0.0) for name in atom_names]
596
+ )
597
+
598
+ # Add dummy mapping for restype 'UNK'
599
+ restype_atom14_to_atom37.append([0] * 14)
600
+ restype_atom37_to_atom14.append([0] * 37)
601
+ restype_atom14_mask.append([0.0] * 14)
602
+
603
+ restype_atom14_to_atom37 = torch.tensor(
604
+ restype_atom14_to_atom37,
605
+ dtype=torch.int32,
606
+ device=protein["aatype"].device,
607
+ )
608
+ restype_atom37_to_atom14 = torch.tensor(
609
+ restype_atom37_to_atom14,
610
+ dtype=torch.int32,
611
+ device=protein["aatype"].device,
612
+ )
613
+ restype_atom14_mask = torch.tensor(
614
+ restype_atom14_mask,
615
+ dtype=torch.float32,
616
+ device=protein["aatype"].device,
617
+ )
618
+ protein_aatype = protein['aatype'].to(torch.long)
619
+
620
+ # create the mapping for (residx, atom14) --> atom37, i.e. an array
621
+ # with shape (num_res, 14) containing the atom37 indices for this protein
622
+ residx_atom14_to_atom37 = restype_atom14_to_atom37[protein_aatype]
623
+ residx_atom14_mask = restype_atom14_mask[protein_aatype]
624
+
625
+ protein["atom14_atom_exists"] = residx_atom14_mask
626
+ protein["residx_atom14_to_atom37"] = residx_atom14_to_atom37.long()
627
+
628
+ # create the gather indices for mapping back
629
+ residx_atom37_to_atom14 = restype_atom37_to_atom14[protein_aatype]
630
+ protein["residx_atom37_to_atom14"] = residx_atom37_to_atom14.long()
631
+
632
+ # create the corresponding mask
633
+ restype_atom37_mask = torch.zeros(
634
+ [21, 37], dtype=torch.float32, device=protein["aatype"].device
635
+ )
636
+ for restype, restype_letter in enumerate(rc.restypes):
637
+ restype_name = rc.restype_1to3[restype_letter]
638
+ atom_names = rc.residue_atoms[restype_name]
639
+ for atom_name in atom_names:
640
+ atom_type = rc.atom_order[atom_name]
641
+ restype_atom37_mask[restype, atom_type] = 1
642
+
643
+ residx_atom37_mask = restype_atom37_mask[protein_aatype]
644
+ protein["atom37_atom_exists"] = residx_atom37_mask
645
+
646
+ return protein
647
+
648
+
649
+ def make_atom14_masks_np(batch):
650
+ batch = tree_map(lambda n: torch.tensor(n), batch, np.ndarray)
651
+ out = make_atom14_masks(batch)
652
+ out = tensor_tree_map(lambda t: np.array(t), out)
653
+ return out
654
+
655
+
656
+ def make_atom14_positions(protein):
657
+ """Constructs denser atom positions (14 dimensions instead of 37)."""
658
+ residx_atom14_mask = protein["atom14_atom_exists"]
659
+ residx_atom14_to_atom37 = protein["residx_atom14_to_atom37"]
660
+
661
+ # Create a mask for known ground truth positions.
662
+ residx_atom14_gt_mask = residx_atom14_mask * batched_gather(
663
+ protein["all_atom_mask"],
664
+ residx_atom14_to_atom37,
665
+ dim=-1,
666
+ no_batch_dims=len(protein["all_atom_mask"].shape[:-1]),
667
+ )
668
+
669
+ # Gather the ground truth positions.
670
+ residx_atom14_gt_positions = residx_atom14_gt_mask[..., None] * (
671
+ batched_gather(
672
+ protein["all_atom_positions"],
673
+ residx_atom14_to_atom37,
674
+ dim=-2,
675
+ no_batch_dims=len(protein["all_atom_positions"].shape[:-2]),
676
+ )
677
+ )
678
+
679
+ protein["atom14_atom_exists"] = residx_atom14_mask
680
+ protein["atom14_gt_exists"] = residx_atom14_gt_mask
681
+ protein["atom14_gt_positions"] = residx_atom14_gt_positions
682
+
683
+ # As the atom naming is ambiguous for 7 of the 20 amino acids, provide
684
+ # alternative ground truth coordinates where the naming is swapped
685
+ restype_3 = [rc.restype_1to3[res] for res in rc.restypes]
686
+ restype_3 += ["UNK"]
687
+
688
+ # Matrices for renaming ambiguous atoms.
689
+ all_matrices = {
690
+ res: torch.eye(
691
+ 14,
692
+ dtype=protein["all_atom_mask"].dtype,
693
+ device=protein["all_atom_mask"].device,
694
+ )
695
+ for res in restype_3
696
+ }
697
+ for resname, swap in rc.residue_atom_renaming_swaps.items():
698
+ correspondences = torch.arange(
699
+ 14, device=protein["all_atom_mask"].device
700
+ )
701
+ for source_atom_swap, target_atom_swap in swap.items():
702
+ source_index = rc.restype_name_to_atom14_names[resname].index(
703
+ source_atom_swap
704
+ )
705
+ target_index = rc.restype_name_to_atom14_names[resname].index(
706
+ target_atom_swap
707
+ )
708
+ correspondences[source_index] = target_index
709
+ correspondences[target_index] = source_index
710
+ renaming_matrix = protein["all_atom_mask"].new_zeros((14, 14))
711
+ for index, correspondence in enumerate(correspondences):
712
+ renaming_matrix[index, correspondence] = 1.0
713
+ all_matrices[resname] = renaming_matrix
714
+ renaming_matrices = torch.stack(
715
+ [all_matrices[restype] for restype in restype_3]
716
+ )
717
+
718
+ # Pick the transformation matrices for the given residue sequence
719
+ # shape (num_res, 14, 14).
720
+ renaming_transform = renaming_matrices[protein["aatype"]]
721
+
722
+ # Apply it to the ground truth positions. shape (num_res, 14, 3).
723
+ alternative_gt_positions = torch.einsum(
724
+ "...rac,...rab->...rbc", residx_atom14_gt_positions, renaming_transform
725
+ )
726
+ protein["atom14_alt_gt_positions"] = alternative_gt_positions
727
+
728
+ # Create the mask for the alternative ground truth (differs from the
729
+ # ground truth mask, if only one of the atoms in an ambiguous pair has a
730
+ # ground truth position).
731
+ alternative_gt_mask = torch.einsum(
732
+ "...ra,...rab->...rb", residx_atom14_gt_mask, renaming_transform
733
+ )
734
+ protein["atom14_alt_gt_exists"] = alternative_gt_mask
735
+
736
+ # Create an ambiguous atoms mask. shape: (21, 14).
737
+ restype_atom14_is_ambiguous = protein["all_atom_mask"].new_zeros((21, 14))
738
+ for resname, swap in rc.residue_atom_renaming_swaps.items():
739
+ for atom_name1, atom_name2 in swap.items():
740
+ restype = rc.restype_order[rc.restype_3to1[resname]]
741
+ atom_idx1 = rc.restype_name_to_atom14_names[resname].index(
742
+ atom_name1
743
+ )
744
+ atom_idx2 = rc.restype_name_to_atom14_names[resname].index(
745
+ atom_name2
746
+ )
747
+ restype_atom14_is_ambiguous[restype, atom_idx1] = 1
748
+ restype_atom14_is_ambiguous[restype, atom_idx2] = 1
749
+
750
+ # From this create an ambiguous_mask for the given sequence.
751
+ protein["atom14_atom_is_ambiguous"] = restype_atom14_is_ambiguous[
752
+ protein["aatype"]
753
+ ]
754
+
755
+ return protein
756
+
757
+
758
+ def atom37_to_frames(protein, eps=1e-8):
759
+ aatype = protein["aatype"]
760
+ all_atom_positions = protein["all_atom_positions"]
761
+ all_atom_mask = protein["all_atom_mask"]
762
+
763
+ batch_dims = len(aatype.shape[:-1])
764
+
765
+ restype_rigidgroup_base_atom_names = np.full([21, 8, 3], "", dtype=object)
766
+ restype_rigidgroup_base_atom_names[:, 0, :] = ["C", "CA", "N"]
767
+ restype_rigidgroup_base_atom_names[:, 3, :] = ["CA", "C", "O"]
768
+
769
+ for restype, restype_letter in enumerate(rc.restypes):
770
+ resname = rc.restype_1to3[restype_letter]
771
+ for chi_idx in range(4):
772
+ if rc.chi_angles_mask[restype][chi_idx]:
773
+ names = rc.chi_angles_atoms[resname][chi_idx]
774
+ restype_rigidgroup_base_atom_names[
775
+ restype, chi_idx + 4, :
776
+ ] = names[1:]
777
+
778
+ restype_rigidgroup_mask = all_atom_mask.new_zeros(
779
+ (*aatype.shape[:-1], 21, 8),
780
+ )
781
+ restype_rigidgroup_mask[..., 0] = 1
782
+ restype_rigidgroup_mask[..., 3] = 1
783
+ restype_rigidgroup_mask[..., :20, 4:] = all_atom_mask.new_tensor(
784
+ rc.chi_angles_mask
785
+ )
786
+
787
+ lookuptable = rc.atom_order.copy()
788
+ lookuptable[""] = 0
789
+ lookup = np.vectorize(lambda x: lookuptable[x])
790
+ restype_rigidgroup_base_atom37_idx = lookup(
791
+ restype_rigidgroup_base_atom_names,
792
+ )
793
+ restype_rigidgroup_base_atom37_idx = aatype.new_tensor(
794
+ restype_rigidgroup_base_atom37_idx,
795
+ )
796
+ restype_rigidgroup_base_atom37_idx = (
797
+ restype_rigidgroup_base_atom37_idx.view(
798
+ *((1,) * batch_dims), *restype_rigidgroup_base_atom37_idx.shape
799
+ )
800
+ )
801
+
802
+ residx_rigidgroup_base_atom37_idx = batched_gather(
803
+ restype_rigidgroup_base_atom37_idx,
804
+ aatype,
805
+ dim=-3,
806
+ no_batch_dims=batch_dims,
807
+ )
808
+
809
+ base_atom_pos = batched_gather(
810
+ all_atom_positions,
811
+ residx_rigidgroup_base_atom37_idx,
812
+ dim=-2,
813
+ no_batch_dims=len(all_atom_positions.shape[:-2]),
814
+ )
815
+
816
+ gt_frames = Rigid.from_3_points(
817
+ p_neg_x_axis=base_atom_pos[..., 0, :],
818
+ origin=base_atom_pos[..., 1, :],
819
+ p_xy_plane=base_atom_pos[..., 2, :],
820
+ eps=eps,
821
+ )
822
+
823
+ group_exists = batched_gather(
824
+ restype_rigidgroup_mask,
825
+ aatype,
826
+ dim=-2,
827
+ no_batch_dims=batch_dims,
828
+ )
829
+
830
+ gt_atoms_exist = batched_gather(
831
+ all_atom_mask,
832
+ residx_rigidgroup_base_atom37_idx,
833
+ dim=-1,
834
+ no_batch_dims=len(all_atom_mask.shape[:-1]),
835
+ )
836
+ gt_exists = torch.min(gt_atoms_exist, dim=-1)[0] * group_exists
837
+
838
+ rots = torch.eye(3, dtype=all_atom_mask.dtype, device=aatype.device)
839
+ rots = torch.tile(rots, (*((1,) * batch_dims), 8, 1, 1))
840
+ rots[..., 0, 0, 0] = -1
841
+ rots[..., 0, 2, 2] = -1
842
+ rots = Rotation(rot_mats=rots)
843
+
844
+ gt_frames = gt_frames.compose(Rigid(rots, None))
845
+
846
+ restype_rigidgroup_is_ambiguous = all_atom_mask.new_zeros(
847
+ *((1,) * batch_dims), 21, 8
848
+ )
849
+ restype_rigidgroup_rots = torch.eye(
850
+ 3, dtype=all_atom_mask.dtype, device=aatype.device
851
+ )
852
+ restype_rigidgroup_rots = torch.tile(
853
+ restype_rigidgroup_rots,
854
+ (*((1,) * batch_dims), 21, 8, 1, 1),
855
+ )
856
+
857
+ for resname, _ in rc.residue_atom_renaming_swaps.items():
858
+ restype = rc.restype_order[rc.restype_3to1[resname]]
859
+ chi_idx = int(sum(rc.chi_angles_mask[restype]) - 1)
860
+ restype_rigidgroup_is_ambiguous[..., restype, chi_idx + 4] = 1
861
+ restype_rigidgroup_rots[..., restype, chi_idx + 4, 1, 1] = -1
862
+ restype_rigidgroup_rots[..., restype, chi_idx + 4, 2, 2] = -1
863
+
864
+ residx_rigidgroup_is_ambiguous = batched_gather(
865
+ restype_rigidgroup_is_ambiguous,
866
+ aatype,
867
+ dim=-2,
868
+ no_batch_dims=batch_dims,
869
+ )
870
+
871
+ residx_rigidgroup_ambiguity_rot = batched_gather(
872
+ restype_rigidgroup_rots,
873
+ aatype,
874
+ dim=-4,
875
+ no_batch_dims=batch_dims,
876
+ )
877
+
878
+ residx_rigidgroup_ambiguity_rot = Rotation(
879
+ rot_mats=residx_rigidgroup_ambiguity_rot
880
+ )
881
+ alt_gt_frames = gt_frames.compose(
882
+ Rigid(residx_rigidgroup_ambiguity_rot, None)
883
+ )
884
+
885
+ gt_frames_tensor = gt_frames.to_tensor_4x4()
886
+ alt_gt_frames_tensor = alt_gt_frames.to_tensor_4x4()
887
+
888
+ protein["rigidgroups_gt_frames"] = gt_frames_tensor
889
+ protein["rigidgroups_gt_exists"] = gt_exists
890
+ protein["rigidgroups_group_exists"] = group_exists
891
+ protein["rigidgroups_group_is_ambiguous"] = residx_rigidgroup_is_ambiguous
892
+ protein["rigidgroups_alt_gt_frames"] = alt_gt_frames_tensor
893
+
894
+ return protein
895
+
896
+
897
+ def get_chi_atom_indices():
898
+ """Returns atom indices needed to compute chi angles for all residue types.
899
+
900
+ Returns:
901
+ A tensor of shape [residue_types=21, chis=4, atoms=4]. The residue types are
902
+ in the order specified in rc.restypes + unknown residue type
903
+ at the end. For chi angles which are not defined on the residue, the
+ corresponding atom indices default to 0.
905
+ """
906
+ chi_atom_indices = []
907
+ for residue_name in rc.restypes:
908
+ residue_name = rc.restype_1to3[residue_name]
909
+ residue_chi_angles = rc.chi_angles_atoms[residue_name]
910
+ atom_indices = []
911
+ for chi_angle in residue_chi_angles:
912
+ atom_indices.append([rc.atom_order[atom] for atom in chi_angle])
913
+ for _ in range(4 - len(atom_indices)):
914
+ atom_indices.append(
915
+ [0, 0, 0, 0]
916
+ ) # For chi angles not defined on the AA.
917
+ chi_atom_indices.append(atom_indices)
918
+
919
+ chi_atom_indices.append([[0, 0, 0, 0]] * 4) # For UNKNOWN residue.
920
+
921
+ return chi_atom_indices
922
+
923
+
924
+ @curry1
925
+ def atom37_to_torsion_angles(
926
+ protein,
927
+ prefix="",
928
+ ):
929
+ """
930
+ Convert coordinates to torsion angles.
931
+
932
+ This function is extremely sensitive to floating point imprecisions
933
+ and should be run with double precision whenever possible.
934
+
935
+ Args:
936
+ Dict containing:
937
+ * (prefix)aatype:
938
+ [*, N_res] residue indices
939
+ * (prefix)all_atom_positions:
940
+ [*, N_res, 37, 3] atom positions (in atom37
941
+ format)
942
+ * (prefix)all_atom_mask:
943
+ [*, N_res, 37] atom position mask
944
+ Returns:
945
+ The same dictionary updated with the following features:
946
+
947
+ "(prefix)torsion_angles_sin_cos" ([*, N_res, 7, 2])
948
+ Torsion angles
949
+ "(prefix)alt_torsion_angles_sin_cos" ([*, N_res, 7, 2])
950
+ Alternate torsion angles (accounting for 180-degree symmetry)
951
+ "(prefix)torsion_angles_mask" ([*, N_res, 7])
952
+ Torsion angles mask
953
+ """
954
+ aatype = protein[prefix + "aatype"]
955
+ all_atom_positions = protein[prefix + "all_atom_positions"]
956
+ all_atom_mask = protein[prefix + "all_atom_mask"]
957
+
958
+ aatype = torch.clamp(aatype, max=20)
959
+
960
+ pad = all_atom_positions.new_zeros(
961
+ [*all_atom_positions.shape[:-3], 1, 37, 3]
962
+ )
963
+ prev_all_atom_positions = torch.cat(
964
+ [pad, all_atom_positions[..., :-1, :, :]], dim=-3
965
+ )
966
+
967
+ pad = all_atom_mask.new_zeros([*all_atom_mask.shape[:-2], 1, 37])
968
+ prev_all_atom_mask = torch.cat([pad, all_atom_mask[..., :-1, :]], dim=-2)
969
+
970
+ pre_omega_atom_pos = torch.cat(
971
+ [prev_all_atom_positions[..., 1:3, :], all_atom_positions[..., :2, :]],
972
+ dim=-2,
973
+ )
974
+ phi_atom_pos = torch.cat(
975
+ [prev_all_atom_positions[..., 2:3, :], all_atom_positions[..., :3, :]],
976
+ dim=-2,
977
+ )
978
+ psi_atom_pos = torch.cat(
979
+ [all_atom_positions[..., :3, :], all_atom_positions[..., 4:5, :]],
980
+ dim=-2,
981
+ )
982
+
983
+ pre_omega_mask = torch.prod(
984
+ prev_all_atom_mask[..., 1:3], dim=-1
985
+ ) * torch.prod(all_atom_mask[..., :2], dim=-1)
986
+ phi_mask = prev_all_atom_mask[..., 2] * torch.prod(
987
+ all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype
988
+ )
989
+ psi_mask = (
990
+ torch.prod(all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype)
991
+ * all_atom_mask[..., 4]
992
+ )
993
+
994
+ chi_atom_indices = torch.as_tensor(
995
+ get_chi_atom_indices(), device=aatype.device
996
+ )
997
+
998
+ atom_indices = chi_atom_indices[..., aatype, :, :]
999
+ chis_atom_pos = batched_gather(
1000
+ all_atom_positions, atom_indices, -2, len(atom_indices.shape[:-2])
1001
+ )
1002
+
1003
+ chi_angles_mask = list(rc.chi_angles_mask)
1004
+ chi_angles_mask.append([0.0, 0.0, 0.0, 0.0])
1005
+ chi_angles_mask = all_atom_mask.new_tensor(chi_angles_mask)
1006
+
1007
+ chis_mask = chi_angles_mask[aatype, :]
1008
+
1009
+ chi_angle_atoms_mask = batched_gather(
1010
+ all_atom_mask,
1011
+ atom_indices,
1012
+ dim=-1,
1013
+ no_batch_dims=len(atom_indices.shape[:-2]),
1014
+ )
1015
+ chi_angle_atoms_mask = torch.prod(
1016
+ chi_angle_atoms_mask, dim=-1, dtype=chi_angle_atoms_mask.dtype
1017
+ )
1018
+ chis_mask = chis_mask * chi_angle_atoms_mask
1019
+
1020
+ torsions_atom_pos = torch.cat(
1021
+ [
1022
+ pre_omega_atom_pos[..., None, :, :],
1023
+ phi_atom_pos[..., None, :, :],
1024
+ psi_atom_pos[..., None, :, :],
1025
+ chis_atom_pos,
1026
+ ],
1027
+ dim=-3,
1028
+ )
1029
+
1030
+ torsion_angles_mask = torch.cat(
1031
+ [
1032
+ pre_omega_mask[..., None],
1033
+ phi_mask[..., None],
1034
+ psi_mask[..., None],
1035
+ chis_mask,
1036
+ ],
1037
+ dim=-1,
1038
+ )
1039
+
1040
+ torsion_frames = Rigid.from_3_points(
1041
+ torsions_atom_pos[..., 1, :],
1042
+ torsions_atom_pos[..., 2, :],
1043
+ torsions_atom_pos[..., 0, :],
1044
+ eps=1e-8,
1045
+ )
1046
+
1047
+ fourth_atom_rel_pos = torsion_frames.invert().apply(
1048
+ torsions_atom_pos[..., 3, :]
1049
+ )
1050
+
1051
+ torsion_angles_sin_cos = torch.stack(
1052
+ [fourth_atom_rel_pos[..., 2], fourth_atom_rel_pos[..., 1]], dim=-1
1053
+ )
1054
+
1055
+ denom = torch.sqrt(
1056
+ torch.sum(
1057
+ torch.square(torsion_angles_sin_cos),
1058
+ dim=-1,
1059
+ dtype=torsion_angles_sin_cos.dtype,
1060
+ keepdims=True,
1061
+ )
1062
+ + 1e-8
1063
+ )
1064
+ torsion_angles_sin_cos = torsion_angles_sin_cos / denom
1065
+
1066
+ torsion_angles_sin_cos = torsion_angles_sin_cos * all_atom_mask.new_tensor(
1067
+ [1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0],
1068
+ )[((None,) * len(torsion_angles_sin_cos.shape[:-2])) + (slice(None), None)]
1069
+
1070
+ chi_is_ambiguous = torsion_angles_sin_cos.new_tensor(
1071
+ rc.chi_pi_periodic,
1072
+ )[aatype, ...]
1073
+
1074
+ mirror_torsion_angles = torch.cat(
1075
+ [
1076
+ all_atom_mask.new_ones(*aatype.shape, 3),
1077
+ 1.0 - 2.0 * chi_is_ambiguous,
1078
+ ],
1079
+ dim=-1,
1080
+ )
1081
+
1082
+ alt_torsion_angles_sin_cos = (
1083
+ torsion_angles_sin_cos * mirror_torsion_angles[..., None]
1084
+ )
1085
+
1086
+ protein[prefix + "torsion_angles_sin_cos"] = torsion_angles_sin_cos
1087
+ protein[prefix + "alt_torsion_angles_sin_cos"] = alt_torsion_angles_sin_cos
1088
+ protein[prefix + "torsion_angles_mask"] = torsion_angles_mask
1089
+
1090
+ return protein
1091
+
1092
+
1093
+ def get_backbone_frames(protein):
1094
+ # DISCREPANCY: AlphaFold uses tensor_7s here. I don't know why.
1095
+ protein["backbone_rigid_tensor"] = protein["rigidgroups_gt_frames"][
1096
+ ..., 0, :, :
1097
+ ]
1098
+ protein["backbone_rigid_mask"] = protein["rigidgroups_gt_exists"][..., 0]
1099
+
1100
+ return protein
1101
+
1102
+
1103
+ def get_chi_angles(protein):
1104
+ dtype = protein["all_atom_mask"].dtype
1105
+ protein["chi_angles_sin_cos"] = (
1106
+ protein["torsion_angles_sin_cos"][..., 3:, :]
1107
+ ).to(dtype)
1108
+ protein["chi_mask"] = protein["torsion_angles_mask"][..., 3:].to(dtype)
1109
+
1110
+ return protein
1111
+
1112
+
1113
+ @curry1
1114
+ def random_crop_to_size(
1115
+ protein,
1116
+ crop_size,
1117
+ max_templates,
1118
+ shape_schema,
1119
+ subsample_templates=False,
1120
+ seed=None,
1121
+ ):
1122
+ """Crop randomly to `crop_size`, or keep as is if shorter than that."""
1123
+ # We want each ensemble to be cropped the same way
1124
+ g = torch.Generator(device=protein["seq_length"].device)
1125
+ if seed is not None:
1126
+ g.manual_seed(seed)
1127
+
1128
+ seq_length = protein["seq_length"]
1129
+
1130
+ if "template_mask" in protein:
1131
+ num_templates = protein["template_mask"].shape[-1]
1132
+ else:
1133
+ num_templates = 0
1134
+
1135
+ # No need to subsample templates if there aren't any
1136
+ subsample_templates = subsample_templates and num_templates
1137
+
1138
+ num_res_crop_size = min(int(seq_length), crop_size)
1139
+
1140
+ def _randint(lower, upper):
1141
+ return int(torch.randint(
1142
+ lower,
1143
+ upper + 1,
1144
+ (1,),
1145
+ device=protein["seq_length"].device,
1146
+ generator=g,
1147
+ )[0])
1148
+
1149
+ if subsample_templates:
1150
+ templates_crop_start = _randint(0, num_templates)
1151
+ templates_select_indices = torch.randperm(
1152
+ num_templates, device=protein["seq_length"].device, generator=g
1153
+ )
1154
+ else:
1155
+ templates_crop_start = 0
1156
+
1157
+ num_templates_crop_size = min(
1158
+ num_templates - templates_crop_start, max_templates
1159
+ )
1160
+
1161
+ n = seq_length - num_res_crop_size
1162
+ if "use_clamped_fape" in protein and protein["use_clamped_fape"] == 1.:
1163
+ right_anchor = n
1164
+ else:
1165
+ x = _randint(0, n)
1166
+ right_anchor = n - x
1167
+
1168
+ num_res_crop_start = _randint(0, right_anchor)
1169
+
1170
+ for k, v in protein.items():
1171
+ if k not in shape_schema or (
1172
+ "template" not in k and NUM_RES not in shape_schema[k]
1173
+ ):
1174
+ continue
1175
+
1176
+ # randomly permute the templates before cropping them.
1177
+ if k.startswith("template") and subsample_templates:
1178
+ v = v[templates_select_indices]
1179
+
1180
+ slices = []
1181
+ for i, (dim_size, dim) in enumerate(zip(shape_schema[k], v.shape)):
1182
+ is_num_res = dim_size == NUM_RES
1183
+ if i == 0 and k.startswith("template"):
1184
+ crop_size = num_templates_crop_size
1185
+ crop_start = templates_crop_start
1186
+ else:
1187
+ crop_start = num_res_crop_start if is_num_res else 0
1188
+ crop_size = num_res_crop_size if is_num_res else dim
1189
+ slices.append(slice(crop_start, crop_start + crop_size))
1190
+ protein[k] = v[slices]
1191
+
1192
+ protein["seq_length"] = protein["seq_length"].new_tensor(num_res_crop_size)
1193
+
1194
+ return protein
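The atom14/atom37 transforms above reduce to per-residue lookup tables applied with gathers. Below is a minimal self-contained sketch of that gather pattern in plain PyTorch; the shapes and index values are illustrative only, and it deliberately avoids the repo's `batched_gather` helper.

import torch

num_res = 3
atom37_positions = torch.randn(num_res, 37, 3)                 # per-residue atom37 coordinates

# Toy atom14 -> atom37 index table: map the first four atom14 slots to
# N, CA, C, O (atom37 indices 0, 1, 2, 4) and mask out the remaining slots.
residx_atom14_to_atom37 = torch.zeros(num_res, 14, dtype=torch.long)
residx_atom14_to_atom37[:, :4] = torch.tensor([0, 1, 2, 4])
atom14_mask = torch.zeros(num_res, 14)
atom14_mask[:, :4] = 1.0

# Gather atom37 coordinates into the denser atom14 layout, zeroing absent atoms.
idx = residx_atom14_to_atom37[..., None].expand(-1, -1, 3)     # [num_res, 14, 3]
atom14_positions = torch.gather(atom37_positions, dim=-2, index=idx) * atom14_mask[..., None]
print(atom14_positions.shape)                                   # torch.Size([3, 14, 3])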
analysis/src/common/geo_utils.py ADDED
@@ -0,0 +1,155 @@
1
+ """
2
+ Utility functions for geometric operations (torch only).
3
+ """
4
+ import torch
5
+
6
+
7
+ def rots_mul_vecs(m, v):
8
+ """(Batch) Apply rotations 'm' to vectors 'v'."""
9
+ return torch.stack([
10
+ m[..., 0, 0] * v[..., 0] + m[..., 0, 1] * v[..., 1] + m[..., 0, 2] * v[..., 2],
11
+ m[..., 1, 0] * v[..., 0] + m[..., 1, 1] * v[..., 1] + m[..., 1, 2] * v[..., 2],
12
+ m[..., 2, 0] * v[..., 0] + m[..., 2, 1] * v[..., 1] + m[..., 2, 2] * v[..., 2],
13
+ ], dim=-1)
14
+
15
+ def distance(p, eps=1e-10):
16
+ """Calculate distance between a pair of points (dim=-2)."""
17
+ # [*, 2, 3]
18
+ return (eps + torch.sum((p[..., 0, :] - p[..., 1, :]) ** 2, dim=-1)) ** 0.5
19
+
20
+ def dihedral(p, eps=1e-10):
21
+ """Calculate dihedral angle between a quadruple of points (dim=-2)."""
22
+ # p: [*, 4, 3]
23
+
24
+ # [*, 3]
25
+ u1 = p[..., 1, :] - p[..., 0, :]
26
+ u2 = p[..., 2, :] - p[..., 1, :]
27
+ u3 = p[..., 3, :] - p[..., 2, :]
28
+
29
+ # [*, 3]
30
+ u1xu2 = torch.cross(u1, u2, dim=-1)
31
+ u2xu3 = torch.cross(u2, u3, dim=-1)
32
+
33
+ # [*]
34
+ u2_norm = (eps + torch.sum(u2 ** 2, dim=-1)) ** 0.5
35
+ u1xu2_norm = (eps + torch.sum(u1xu2 ** 2, dim=-1)) ** 0.5
36
+ u2xu3_norm = (eps + torch.sum(u2xu3 ** 2, dim=-1)) ** 0.5
37
+
38
+ # [*]
39
+ cos_enc = torch.einsum('...d,...d->...', u1xu2, u2xu3)/ (u1xu2_norm * u2xu3_norm)
40
+ sin_enc = torch.einsum('...d,...d->...', u2, torch.cross(u1xu2, u2xu3, dim=-1)) / (u2_norm * u1xu2_norm * u2xu3_norm)
41
+
42
+ return torch.stack([cos_enc, sin_enc], dim=-1)
43
+
44
+ def calc_distogram(pos: torch.Tensor, min_bin: float, max_bin: float, num_bins: int):
45
+ # pos: [*, L, 3]
46
+ dists_2d = torch.linalg.norm(
47
+ pos[..., :, None, :] - pos[..., None, :, :], axis=-1
48
+ )[..., None]
49
+ lower = torch.linspace(
50
+ min_bin,
51
+ max_bin,
52
+ num_bins,
53
+ device=pos.device)
54
+ upper = torch.cat([lower[1:], lower.new_tensor([1e8])], dim=-1)
55
+ distogram = ((dists_2d > lower) * (dists_2d < upper)).type(pos.dtype)
56
+ return distogram
57
+
58
+ def rmsd(xyz1, xyz2):
59
+ """ Abbreviation for squared_deviation(xyz1, xyz2, 'rmsd') """
60
+ return squared_deviation(xyz1, xyz2, 'rmsd')
61
+
62
+ def squared_deviation(xyz1, xyz2, reduction='none'):
63
+ """Squared point-wise deviation between two point clouds after alignment.
64
+
65
+ Args:
66
+ xyz1: (*, L, 3), to be transformed
67
+ xyz2: (*, L, 3), the reference
68
+
69
+ Returns:
70
+ (*,) RMSD values if reduction='rmsd', or (*, L) per-point squared deviations if reduction='none'
71
+ """
72
+ map_to_np = False
73
+ if not torch.is_tensor(xyz1):
74
+ map_to_np = True
75
+ xyz1 = torch.as_tensor(xyz1)
76
+ xyz2 = torch.as_tensor(xyz2)
77
+
78
+ R, t = _find_rigid_alignment(xyz1, xyz2)
79
+
80
+ # print(R.shape, t.shape) # B, 3, 3 & B, 3
81
+
82
+ # xyz1_aligned = (R.bmm(xyz1.transpose(-2,-1))).transpose(-2,-1) + t.unsqueeze(1)
83
+ xyz1_aligned = (torch.matmul(R, xyz1.transpose(-2, -1))).transpose(-2, -1) + t.unsqueeze(0)
84
+
85
+ sd = ((xyz1_aligned - xyz2)**2).sum(dim=-1) # (*, L)
86
+
87
+ assert sd.shape == xyz1.shape[:-1]
88
+ if reduction == 'none':
89
+ pass
90
+ elif reduction == 'rmsd':
91
+ sd = torch.sqrt(sd.mean(dim=-1))
92
+ else:
93
+ raise NotImplementedError()
94
+
95
+ sd = sd.numpy() if map_to_np else sd
96
+ return sd
97
+
98
+ def _find_rigid_alignment(src, tgt):
99
+ """Inspired by https://research.pasteur.fr/en/member/guillaume-bouvier/;
100
+ https://gist.github.com/bougui505/e392a371f5bab095a3673ea6f4976cc8
101
+
102
+ See: https://en.wikipedia.org/wiki/Kabsch_algorithm
103
+
104
+ 2-D or 3-D registration with known correspondences.
105
+ Registration occurs in the zero centered coordinate system, and then
106
+ must be transported back.
107
+
108
+ Args:
109
+ src: Torch tensor of shape (*, L, 3) -- Point Cloud to Align (source)
110
+ tgt: Torch tensor of shape (*, L, 3) -- Reference Point Cloud (target)
111
+ Returns:
112
+ R: optimal rotation (*, 3, 3)
113
+ t: optimal translation (*, 3)
114
+
115
+ Test on rotation + translation and on rotation + translation + reflection
116
+ >>> A = torch.tensor([[1., 1.], [2., 2.], [1.5, 3.]], dtype=torch.float)
117
+ >>> R0 = torch.tensor([[np.cos(60), -np.sin(60)], [np.sin(60), np.cos(60)]], dtype=torch.float)
118
+ >>> B = (R0.mm(A.T)).T
119
+ >>> t0 = torch.tensor([3., 3.])
120
+ >>> B += t0
121
+ >>> R, t = _find_rigid_alignment(A, B)
122
+ >>> A_aligned = (R.mm(A.T)).T + t
123
+ >>> rmsd = torch.sqrt(((A_aligned - B)**2).sum(axis=1).mean())
124
+ >>> rmsd
125
+ tensor(3.7064e-07)
126
+ >>> B *= torch.tensor([-1., 1.])
127
+ >>> R, t = _find_rigid_alignment(A, B)
128
+ >>> A_aligned = (R.mm(A.T)).T + t
129
+ >>> rmsd = torch.sqrt(((A_aligned - B)**2).sum(axis=1).mean())
130
+ >>> rmsd
131
+ tensor(3.7064e-07)
132
+ """
133
+ assert src.shape[-2] > 1
134
+ src_com = src.mean(dim=-2, keepdim=True)
135
+ tgt_com = tgt.mean(dim=-2, keepdim=True)
136
+ src_centered = src - src_com
137
+ tgt_centered = tgt - tgt_com
138
+
139
+ # Covariance matrix
140
+
141
+ # H = src_centered.transpose(-2,-1).bmm(tgt_centered) # *, 3, 3
142
+ H = torch.matmul(src_centered.transpose(-2,-1), tgt_centered)
143
+
144
+ U, S, V = torch.svd(H)
145
+ # Rotation matrix
146
+
147
+ # R = V.bmm(U.transpose(-2,-1))
148
+ R = torch.matmul(V, U.transpose(-2, -1))
149
+
150
+ # Translation vector
151
+
152
+ # t = tgt_com - R.bmm(src_com.transpose(-2,-1)).transpose(-2,-1)
153
+ t = tgt_com - torch.matmul(R, src_com.transpose(-2, -1)).transpose(-2, -1)
154
+
155
+ return R, t.squeeze(-2) # (B, 3, 3), (B, 3)
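A small usage sketch for the helpers above. It assumes `analysis/src` is on the Python path so that `src.common.geo_utils` imports as in the repo; the random coordinates and the rotation are placeholders.

import math
import torch
from src.common.geo_utils import dihedral, rmsd, calc_distogram

coords = torch.randn(10, 3)                     # toy 10-point CA trace

# Dihedral of four consecutive points -> [cos, sin] encoding, shape [2].
print(dihedral(coords[:4]))

# RMSD between the trace and a rigidly moved copy should be close to zero after alignment.
c, s = math.cos(0.3), math.sin(0.3)
R = torch.tensor([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
moved = coords @ R.T + torch.tensor([1.0, 2.0, 3.0])
print(rmsd(moved, coords))                      # ~0

# One-hot distance binning over all pairs: shape [10, 10, 22].
print(calc_distogram(coords, min_bin=2.0, max_bin=22.0, num_bins=22).shape)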
analysis/src/common/pdb_utils.py ADDED
@@ -0,0 +1,353 @@
1
+ """Utility functions for operating PDB files.
2
+ """
3
+ import os
4
+ import re
5
+ from typing import Optional
6
+ from collections import OrderedDict
7
+
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
+ import biotite.structure as struct
12
+ from biotite.structure.io.pdb import PDBFile
13
+
14
+ from src.common import protein
15
+
16
+
17
+ def write_pdb_string(pdb_string: str, save_to: str):
18
+ """Write pdb string to file"""
19
+ with open(save_to, 'w') as f:
20
+ f.write(pdb_string)
21
+
22
+ def read_pdb_to_string(pdb_file):
23
+ """Read PDB file as pdb string. Convenient API"""
24
+ with open(pdb_file, 'r') as fi:
25
+ pdb_string = ''
26
+ for line in fi:
27
+ if line.startswith('END') or line.startswith('TER') \
28
+ or line.startswith('MODEL') or line.startswith('ATOM'):
29
+ pdb_string += line
30
+ return pdb_string
31
+
32
+ def merge_pdbfiles(input, output_file, verbose=True):
+ """Merge a list of PDB files (or every .pdb file in a directory) into one multi-model PDB file, in order."""
34
+ if isinstance(input, str):
35
+ pdb_files = [os.path.join(input, f) for f in os.listdir(input) if f.endswith('.pdb')]
36
+ elif isinstance(input, list):
37
+ pdb_files = input
38
+
39
+ os.makedirs(os.path.dirname(output_file), exist_ok=True)
40
+
41
+ model_number = 0
42
+ pdb_lines = []
43
+ if verbose:
44
+ _iter = tqdm(pdb_files, desc='Merging PDBs')
45
+ else:
46
+ _iter = pdb_files
47
+
48
+ for pdb_file in _iter:
49
+ with open(pdb_file, 'r') as pdb:
50
+ lines = pdb.readlines()
51
+ single_model = True
52
+
53
+ for line in lines:
54
+ if line.startswith('MODEL') or line.startswith('ENDMDL'):
55
+ single_model = False
56
+ break
57
+
58
+ if single_model: # single model
59
+ model_number += 1
60
+ pdb_lines.append(f"MODEL {model_number}")
61
+ for line in lines:
62
+ if line.startswith('TER') or line.startswith('ATOM'):
63
+ pdb_lines.append(line.strip())
64
+ pdb_lines.append("ENDMDL")
65
+ else: # multiple models
66
+ for line in lines:
67
+ if line.startswith('MODEL'):
68
+ model_number += 1
69
+ if model_number > 1:
70
+ pdb_lines.append("ENDMDL")
71
+ pdb_lines.append(f"MODEL {model_number}")
72
+ elif line.startswith('END'):
73
+ continue
74
+ elif line.startswith('TER') or line.startswith('ATOM'):
75
+ pdb_lines.append(line.strip())
76
+ pdb_lines.append('ENDMDL')
77
+ pdb_lines.append('END')
78
+ pdb_lines = [_line.ljust(80) for _line in pdb_lines]
79
+ pdb_str = '\n'.join(pdb_lines) + '\n'
80
+ with open(output_file, 'w') as fo:
81
+ fo.write(pdb_str)
82
+
83
+ if verbose:
84
+ print(f"Merged {len(pdb_files)} PDB files into {output_file} with {model_number} models.")
85
+
86
+
87
+ def split_pdbfile(pdb_file, output_dir=None, suffix='index', verbose=True):
88
+ """Split a PDB file into multiple PDB files in output_dir.
89
+ Assumes that each model is wrapped by 'MODEL' and 'ENDMDL'.
90
+ """
91
+ assert os.path.exists(pdb_file), f"File {pdb_file} does not exist."
92
+ assert suffix == 'index', 'Only support [suffix=index] for now.'
93
+
94
+ if output_dir is not None: # also dump to output_dir
95
+ os.makedirs(output_dir, exist_ok=True)
96
+ base = os.path.splitext(os.path.basename(pdb_file))[0]
97
+
98
+ i = 0
99
+ pdb_strs = []
100
+ pdb_string = ''
101
+ with open(pdb_file, 'r') as fi:
102
+ # pdb_string = ''
103
+ for line in fi:
104
+ if line.startswith('MODEL'):
105
+ pdb_string = ''
106
+ elif line.startswith('ATOM') or line.startswith('TER'):
107
+ pdb_string += line
108
+ elif line.startswith('ENDMDL') or line.startswith('END'):
109
+ if pdb_string == '': continue
110
+ pdb_string += 'END\n'
111
+ if output_dir is not None:
112
+ _save_to = os.path.join(output_dir, f'{base}_{i}.pdb') if suffix == 'index' else None
113
+ with open(_save_to, 'w') as fo:
114
+ fo.write(pdb_string)
115
+ pdb_strs.append(pdb_string)
116
+ pdb_string = ''
117
+ i += 1
118
+ else:
119
+ if verbose:
120
+ print(f"Warning: line '{line}' is not recognized. Skip.")
121
+ if verbose:
122
+ print(f">>> Split pdb {pdb_file} into {i}/{len(pdb_strs)} structures.")
123
+ return pdb_strs
124
+
125
+
126
+ def stratify_sample_pdbfile(input_path, output_path, n_max_sample=1000, end_at=0, verbose=True):
+ """Evenly subsample up to n_max_sample models from a multi-model PDB file and write them to output_path."""
128
+ assert os.path.exists(input_path), f"File {input_path} does not exist."
129
+ assert not os.path.exists(output_path), f"Output path {output_path} already exists."
130
+
131
+ i = 0
132
+ pdb_strs = []
133
+ with open(input_path, 'r') as fi:
134
+ # pdb_string = ''
135
+ pdb_lines_per_model = []
136
+ for line in fi:
137
+ if line.startswith('MODEL'):
138
+ pdb_lines_per_model = []
139
+ elif line.startswith('ATOM') or line.startswith('TER'):
140
+ pdb_lines_per_model.append(line.strip())
141
+ elif line.startswith('ENDMDL') or line.startswith('END'):
142
+ if pdb_lines_per_model == []: continue # skip empty model
143
+ # wrap up the model
144
+ pdb_lines_per_model.append('ENDMDL')
145
+ # Pad all lines to 80 characters.
146
+ pdb_lines_per_model = [_line.ljust(80) for _line in pdb_lines_per_model]
147
+ pdb_str_per_model = '\n'.join(pdb_lines_per_model) + '\n' # Add terminating newline.
148
+ pdb_strs.append(pdb_str_per_model)
149
+ # reset
150
+ pdb_lines_per_model = []
151
+ i += 1
152
+ else:
153
+ if verbose:
154
+ print(f"Warning: line '{line}' is not recognized. Skip.")
155
+ if end_at > 0 and i > end_at:
156
+ break
157
+ end = end_at if end_at > 0 else len(pdb_strs)
158
+
159
+ # sample evenly
160
+ if end > n_max_sample:
161
+ interleave_step = int(end // n_max_sample) # floor
162
+ sampled_pdb_strs = pdb_strs[:end][::interleave_step][:n_max_sample]
163
+ else:
164
+ sampled_pdb_strs = pdb_strs[:end]
165
+
166
+ output_str = ''
167
+ for i, pdb_str in enumerate(sampled_pdb_strs): # renumber models
168
+ output_str += f"MODEL {i+1}".ljust(80) + '\n'
169
+ output_str += pdb_str
170
+ output_str = output_str + ('END'.ljust(80) + '\n')
171
+
172
+ write_pdb_string(output_str, save_to=output_path)
173
+ if verbose:
174
+ print(f">>> Sampled {len(sampled_pdb_strs)}/{n_max_sample} structures from {input_path}.")
175
+ return
176
+
177
+
178
+ def protein_with_default_params(
179
+ atom_positions: np.ndarray,
180
+ atom_mask: np.ndarray,
181
+ aatype: Optional[np.ndarray] = None,
182
+ b_factors: Optional[np.ndarray] = None,
183
+ chain_index: Optional[np.ndarray] = None,
184
+ residue_index: Optional[np.ndarray] = None,
185
+ ):
186
+ assert atom_positions.ndim == 3
187
+ assert atom_positions.shape[-1] == 3
188
+ assert atom_positions.shape[-2] == 37
189
+ n = atom_positions.shape[0]
190
+ sqz = lambda x: np.squeeze(x) if x.shape[0] == 1 and len(x.shape) > 1 else x
191
+
192
+ residue_index = np.arange(n) + 1 if residue_index is None else sqz(residue_index)
193
+ chain_index = np.zeros(n) if chain_index is None else sqz(chain_index)
194
+ b_factors = np.zeros([n, 37]) if b_factors is None else sqz(b_factors)
195
+ aatype = np.zeros(n, dtype=int) if aatype is None else sqz(aatype)
196
+
197
+ return protein.Protein(
198
+ atom_positions=atom_positions,
199
+ atom_mask=atom_mask,
200
+ aatype=aatype,
201
+ residue_index=residue_index,
202
+ chain_index=chain_index,
203
+ b_factors=b_factors
204
+ )
205
+
206
+ def atom37_to_pdb(
207
+ save_to: str,
208
+ atom_positions: np.ndarray,
209
+ aatype: Optional[np.ndarray] = None,
210
+ b_factors: Optional[np.ndarray] = None,
211
+ chain_index: Optional[np.ndarray] = None,
212
+ residue_index: Optional[np.ndarray] = None,
213
+ overwrite: bool = False,
214
+ no_indexing: bool = True,
215
+ ):
216
+ # configure save path
217
+ if overwrite:
218
+ max_existing_idx = 0
219
+ else:
220
+ file_dir = os.path.dirname(save_to)
221
+ file_name = os.path.splitext(os.path.basename(save_to))[0]  # str.strip('.pdb') would trim characters, not the suffix
222
+ existing_files = [x for x in os.listdir(file_dir) if file_name in x]
223
+ max_existing_idx = max([
224
+ int(re.findall(r'_(\d+).pdb', x)[0]) for x in existing_files if re.findall(r'_(\d+).pdb', x)
225
+ if re.findall(r'_(\d+).pdb', x)] + [0])
226
+ if not no_indexing:
+ save_to = save_to.replace('.pdb', '') + f'_{max_existing_idx+1}.pdb'
230
+
231
+ with open(save_to, 'w') as f:
232
+ if atom_positions.ndim == 4:
233
+ for mi, pos37 in enumerate(atom_positions):
234
+ atom_mask = np.sum(np.abs(pos37), axis=-1) > 1e-7
235
+ prot = protein_with_default_params(
236
+ pos37, atom_mask, aatype=aatype, b_factors=b_factors,
237
+ chain_index=chain_index, residue_index=residue_index
238
+ )
239
+ pdb_str = protein.to_pdb(prot, model=mi+1, add_end=False)
240
+ f.write(pdb_str)
241
+ elif atom_positions.ndim == 3:
242
+ atom_mask = np.sum(np.abs(atom_positions), axis=-1) > 1e-7
243
+ prot = protein_with_default_params(
244
+ atom_positions, atom_mask, aatype=aatype, b_factors=b_factors,
245
+ chain_index=chain_index, residue_index=residue_index
246
+ )
247
+ pdb_str = protein.to_pdb(prot, model=1, add_end=False)
248
+ f.write(pdb_str)
249
+ else:
250
+ raise ValueError(f'Invalid positions shape {atom_positions.shape}')
251
+ f.write('END')
252
+
253
+ return save_to
254
+
255
+ def extract_backbone_coords_from_pdb(pdb_path: str, target_atoms: Optional[list] = ["CA"]):
256
+ structure = PDBFile.read(pdb_path)
257
+ structure_list = structure.get_structure()
258
+
259
+ coords_list = []
260
+ for b_idx in range(structure.get_model_count()):
261
+ chain = structure_list[b_idx]
262
+
263
+ backbone_atoms = chain[struct.filter_backbone(chain)] # This includes the “N”, “CA” and “C” atoms of amino acids.
264
+ ret_coords = OrderedDict()
265
+ # init dict
266
+ for k in target_atoms:
267
+ ret_coords[k] = []
268
+
269
+ for c in backbone_atoms:
270
+ if c.atom_name in ret_coords:
271
+ ret_coords[c.atom_name].append(c.coord)
272
+
273
+ ret_coords = [np.vstack(v) for k,v in ret_coords.items()]
274
+ if len(target_atoms) == 1:
275
+ ret_coords = ret_coords[0] # L, 3
276
+ else:
277
+ ret_coords = np.stack(ret_coords, axis=1) # L, na, 3
278
+
279
+ coords_list.append(ret_coords)
280
+
281
+ coords_list = np.stack(coords_list, axis=0) # B, L, na, 3 or B, L, 3 (ca only)
282
+ return coords_list
283
+
284
+
285
+ def extract_backbone_coords_from_pdb_dir(pdb_dir: str):
286
+ return np.concatenate([
287
+ extract_backbone_coords_from_pdb(os.path.join(pdb_dir, f))
288
+ for f in os.listdir(pdb_dir) if f.endswith('.pdb')
289
+ ], axis=0)
290
+
291
+ def extract_backbone_coords_from_npy(npy_path: str):
292
+ return np.load(npy_path)
293
+
294
+
295
+ def extract_backbone_coords(input_path: str,
296
+ max_n_model: Optional[int] = None,
297
+ ):
298
+ """Extract backbone coordinates from PDB file.
299
+
300
+ Args:
301
+ input_path (str): The path to the PDB file.
302
+ ca_only (bool): Whether to extract only CA coordinates.
304
+ """
305
+ assert os.path.exists(input_path), f"File {input_path} does not exist."
306
+ if input_path.endswith('.pdb'):
307
+ coords = extract_backbone_coords_from_pdb(input_path)
308
+ elif input_path.endswith('.npy'):
309
+ coords = extract_backbone_coords_from_npy(input_path)
310
+ elif os.path.isdir(input_path):
311
+ coords = extract_backbone_coords_from_pdb_dir(input_path)
312
+ else:
313
+ raise ValueError(f"Unrecognized input path {input_path}.")
314
+
315
+ if max_n_model is not None and len(coords) > max_n_model > 0:
316
+ coords = coords[:max_n_model]
317
+ return coords
318
+
319
+
320
+
321
+ if __name__ == '__main__':
322
+ import argparse
323
+ def get_argparser():
324
+ parser = argparse.ArgumentParser(description='Main script for pdb processing.')
325
+ parser.add_argument("input", type=str, help="The generic path to sampled pdb directory / pdb file.")
326
+ parser.add_argument("-m", "--mode", type=str, help="The mode of processing.",
327
+ default="split")
328
+ parser.add_argument("-o", "--output", type=str, help="The output directory for processed pdb files.",
329
+ default=None)
330
+
331
+ args = parser.parse_args()
332
+ return args
333
+ args = get_argparser()
334
+
335
+ # ad hoc functions
336
+ def split_pdbs(args):
337
+ os.makedirs(args.output, exist_ok=True)
338
+ _ = split_pdbfile(pdb_file=args.input,
339
+ output_dir=args.output)
340
+
341
+ def merge_pdbs(args):
342
+ output = args.output or f"{args.input}_all.pdb"
343
+ merge_pdbfiles(input=args.input,
344
+ output_file=output)
345
+
346
+ if args.mode == "split":
347
+ split_pdbs(args)
348
+ elif args.mode == "merge":
349
+ merge_pdbs(args)
350
+ elif args.mode == "stratify":
351
+ stratify_sample_pdbfile(input_path=args.input, output_path=args.output)
352
+ else:
353
+ raise ValueError(f"Unrecognized mode {args.mode}.")
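A rough usage sketch for the module above, using the Python API rather than the `__main__` CLI. It assumes the script is run from a directory where `src.common` is importable; all file paths are hypothetical placeholders.

from src.common.pdb_utils import merge_pdbfiles, split_pdbfile, extract_backbone_coords

# Merge every .pdb in a sample directory into one multi-model file, then split it back out.
merge_pdbfiles("samples/", "merged/all.pdb")
split_pdbfile("merged/all.pdb", output_dir="split/")

# CA coordinates of every model, shape [n_models, L, 3].
ca_coords = extract_backbone_coords("merged/all.pdb")
print(ca_coords.shape)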
analysis/src/common/protein.py ADDED
@@ -0,0 +1,289 @@
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Protein data type."""
16
+ import dataclasses
17
+ import io
18
+ from typing import Any, Mapping, Optional
19
+
20
+ from Bio.PDB import PDBParser
21
+ import numpy as np
22
+
23
+ from src.common import residue_constants
24
+
25
+
26
+ FeatureDict = Mapping[str, np.ndarray]
27
+ ModelOutput = Mapping[str, Any] # Is a nested dict.
28
+
29
+ # Complete sequence of chain IDs supported by the PDB format.
30
+ PDB_CHAIN_IDS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
31
+ PDB_MAX_CHAINS = len(PDB_CHAIN_IDS) # := 62.
32
+
33
+
34
+ @dataclasses.dataclass(frozen=True)
35
+ class Protein:
36
+ """Protein structure representation."""
37
+
38
+ # Cartesian coordinates of atoms in angstroms. The atom types correspond to
39
+ # residue_constants.atom_types, i.e. the first three are N, CA, C.
40
+ atom_positions: np.ndarray # [num_res, num_atom_type, 3]
41
+
42
+ # Amino-acid type for each residue represented as an integer between 0 and
43
+ # 20, where 20 is 'X'.
44
+ aatype: np.ndarray # [num_res]
45
+
46
+ # Binary float mask to indicate presence of a particular atom. 1.0 if an atom
47
+ # is present and 0.0 if not. This should be used for loss masking.
48
+ atom_mask: np.ndarray # [num_res, num_atom_type]
49
+
50
+ # Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
51
+ residue_index: np.ndarray # [num_res]
52
+
53
+ # 0-indexed number corresponding to the chain in the protein that this residue
54
+ # belongs to.
55
+ chain_index: np.ndarray # [num_res]
56
+
57
+ # B-factors, or temperature factors, of each residue (in sq. angstroms units),
58
+ # representing the displacement of the residue from its ground truth mean
59
+ # value.
60
+ b_factors: np.ndarray # [num_res, num_atom_type]
61
+
62
+ def __post_init__(self):
63
+ if len(np.unique(self.chain_index)) > PDB_MAX_CHAINS:
64
+ raise ValueError(
65
+ f'Cannot build an instance with more than {PDB_MAX_CHAINS} chains '
66
+ 'because these cannot be written to PDB format.')
67
+
68
+ def to_dict(self):
69
+ return dataclasses.asdict(self)
70
+
71
+
72
+ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
73
+ """Takes a PDB string and constructs a Protein object.
74
+
75
+ WARNING: All non-standard residue types will be converted into UNK. All
76
+ non-standard atoms will be ignored.
77
+
78
+ Args:
79
+ pdb_str: The contents of the pdb file
80
+ chain_id: If chain_id is specified (e.g. A), then only that chain
81
+ is parsed. Otherwise all chains are parsed.
82
+
83
+ Returns:
84
+ A new `Protein` parsed from the pdb contents.
85
+ """
86
+ pdb_fh = io.StringIO(pdb_str)
87
+ parser = PDBParser(QUIET=True)
88
+ structure = parser.get_structure('none', pdb_fh)
89
+ models = list(structure.get_models())
90
+ if len(models) != 1:
91
+ raise ValueError(
92
+ f'Only single model PDBs are supported. Found {len(models)} models.')
93
+ model = models[0]
94
+
95
+ atom_positions = []
96
+ aatype = []
97
+ atom_mask = []
98
+ residue_index = []
99
+ chain_ids = []
100
+ b_factors = []
101
+
102
+ for chain in model:
103
+ if chain_id is not None and chain.id != chain_id:
104
+ continue
105
+ for res in chain:
106
+ if res.id[2] != ' ':
107
+ raise ValueError(
108
+ f'PDB contains an insertion code at chain {chain.id} and residue '
109
+ f'index {res.id[1]}. These are not supported.')
110
+ res_shortname = residue_constants.restype_3to1.get(res.resname, 'X')
111
+ restype_idx = residue_constants.restype_order.get(
112
+ res_shortname, residue_constants.restype_num)
113
+ pos = np.zeros((residue_constants.atom_type_num, 3))
114
+ mask = np.zeros((residue_constants.atom_type_num,))
115
+ res_b_factors = np.zeros((residue_constants.atom_type_num,))
116
+ for atom in res:
117
+ if atom.name not in residue_constants.atom_types:
118
+ continue
119
+ pos[residue_constants.atom_order[atom.name]] = atom.coord
120
+ mask[residue_constants.atom_order[atom.name]] = 1.
121
+ res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
122
+ if np.sum(mask) < 0.5:
123
+ # If no known atom positions are reported for the residue then skip it.
124
+ continue
125
+ aatype.append(restype_idx)
126
+ atom_positions.append(pos)
127
+ atom_mask.append(mask)
128
+ residue_index.append(res.id[1])
129
+ chain_ids.append(chain.id)
130
+ b_factors.append(res_b_factors)
131
+
132
+ # Chain IDs are usually characters so map these to ints.
133
+ unique_chain_ids = np.unique(chain_ids)
134
+ chain_id_mapping = {cid: n for n, cid in enumerate(unique_chain_ids)}
135
+ chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids])
136
+
137
+ return Protein(
138
+ atom_positions=np.array(atom_positions),
139
+ atom_mask=np.array(atom_mask),
140
+ aatype=np.array(aatype),
141
+ residue_index=np.array(residue_index),
142
+ chain_index=chain_index,
143
+ b_factors=np.array(b_factors))
144
+
145
+
146
+ def _chain_end(atom_index, end_resname, chain_name, residue_index) -> str:
147
+ chain_end = 'TER'
148
+ return (f'{chain_end:<6}{atom_index:>5} {end_resname:>3} '
149
+ f'{chain_name:>1}{residue_index:>4}')
150
+
151
+
152
+ def to_pdb(prot: Protein, model=1, add_end=True) -> str:
153
+ """Converts a `Protein` instance to a PDB string.
154
+
155
+ Args:
156
+ prot: The protein to convert to PDB.
+ model: The model number to write in the MODEL record.
+ add_end: Whether to append an END record.
157
+
158
+ Returns:
159
+ PDB string.
160
+ """
161
+ restypes = residue_constants.restypes + ['X']
162
+ res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK')
163
+ atom_types = residue_constants.atom_types
164
+
165
+ pdb_lines = []
166
+
167
+ atom_mask = prot.atom_mask
168
+ aatype = prot.aatype
169
+ atom_positions = prot.atom_positions
170
+ residue_index = prot.residue_index.astype(int)
171
+ chain_index = prot.chain_index.astype(int)
172
+ b_factors = prot.b_factors
173
+
174
+ if np.any(aatype > residue_constants.restype_num):
175
+ raise ValueError('Invalid aatypes.')
176
+
177
+ # Construct a mapping from chain integer indices to chain ID strings.
178
+ chain_ids = {}
179
+ for i in np.unique(chain_index): # np.unique gives sorted output.
180
+ if i >= PDB_MAX_CHAINS:
181
+ raise ValueError(
182
+ f'The PDB format supports at most {PDB_MAX_CHAINS} chains.')
183
+ chain_ids[i] = PDB_CHAIN_IDS[i]
184
+
185
+ pdb_lines.append(f'MODEL {model}')
186
+ atom_index = 1
187
+ last_chain_index = chain_index[0]
188
+ # Add all atom sites.
189
+ for i in range(aatype.shape[0]):
190
+ # Close the previous chain if in a multichain PDB.
191
+ if last_chain_index != chain_index[i]:
192
+ pdb_lines.append(_chain_end(
193
+ atom_index, res_1to3(aatype[i - 1]), chain_ids[chain_index[i - 1]],
194
+ residue_index[i - 1]))
195
+ last_chain_index = chain_index[i]
196
+ atom_index += 1 # Atom index increases at the TER symbol.
197
+
198
+ res_name_3 = res_1to3(aatype[i])
199
+ for atom_name, pos, mask, b_factor in zip(
200
+ atom_types, atom_positions[i], atom_mask[i], b_factors[i]):
201
+ if mask < 0.5:
202
+ continue
203
+
204
+ # skip CB for GLY
205
+ if res_name_3 == 'GLY' and atom_name == 'CB':
206
+ continue
207
+
208
+ record_type = 'ATOM'
209
+ name = atom_name if len(atom_name) == 4 else f' {atom_name}'
210
+ alt_loc = ''
211
+ insertion_code = ''
212
+ occupancy = 1.00
213
+ element = atom_name[0] # Protein supports only C, N, O, S, this works.
214
+ charge = ''
215
+ # PDB is a columnar format, every space matters here!
216
+ atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}'
217
+ f'{res_name_3:>3} {chain_ids[chain_index[i]]:>1}'
218
+ f'{residue_index[i]:>4}{insertion_code:>1} '
219
+ f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}'
220
+ f'{occupancy:>6.2f}{b_factor:>6.2f} '
221
+ f'{element:>2}{charge:>2}')
222
+ pdb_lines.append(atom_line)
223
+ atom_index += 1
224
+
225
+ # Close the final chain.
226
+ pdb_lines.append(_chain_end(atom_index, res_1to3(aatype[-1]),
227
+ chain_ids[chain_index[-1]], residue_index[-1]))
228
+ pdb_lines.append('ENDMDL')
229
+ if add_end:
230
+ pdb_lines.append('END')
231
+
232
+ # Pad all lines to 80 characters.
233
+ pdb_lines = [line.ljust(80) for line in pdb_lines]
234
+ return '\n'.join(pdb_lines) + '\n' # Add terminating newline.
235
+
236
+
237
+ def ideal_atom_mask(prot: Protein) -> np.ndarray:
238
+ """Computes an ideal atom mask.
239
+
240
+ `Protein.atom_mask` typically is defined according to the atoms that are
241
+ reported in the PDB. This function computes a mask according to heavy atoms
242
+ that should be present in the given sequence of amino acids.
243
+
244
+ Args:
245
+ prot: `Protein` whose fields are `numpy.ndarray` objects.
246
+
247
+ Returns:
248
+ An ideal atom mask.
249
+ """
250
+ return residue_constants.STANDARD_ATOM_MASK[prot.aatype]
251
+
252
+
253
+ def from_prediction(
254
+ features: FeatureDict,
255
+ result: ModelOutput,
256
+ b_factors: Optional[np.ndarray] = None,
257
+ remove_leading_feature_dimension: bool = True) -> Protein:
258
+ """Assembles a protein from a prediction.
259
+
260
+ Args:
261
+ features: Dictionary holding model inputs.
262
+ result: Dictionary holding model outputs.
263
+ b_factors: (Optional) B-factors to use for the protein.
264
+ remove_leading_feature_dimension: Whether to remove the leading dimension
265
+ of the `features` values.
266
+
267
+ Returns:
268
+ A protein instance.
269
+ """
270
+ fold_output = result['structure_module']
271
+
272
+ def _maybe_remove_leading_dim(arr: np.ndarray) -> np.ndarray:
273
+ return arr[0] if remove_leading_feature_dimension else arr
274
+
275
+ if 'asym_id' in features:
276
+ chain_index = _maybe_remove_leading_dim(features['asym_id'])
277
+ else:
278
+ chain_index = np.zeros_like(_maybe_remove_leading_dim(features['aatype']))
279
+
280
+ if b_factors is None:
281
+ b_factors = np.zeros_like(fold_output['final_atom_mask'])
282
+
283
+ return Protein(
284
+ aatype=_maybe_remove_leading_dim(features['aatype']),
285
+ atom_positions=fold_output['final_atom_positions'],
286
+ atom_mask=fold_output['final_atom_mask'],
287
+ residue_index=_maybe_remove_leading_dim(features['residue_index']) + 1,
288
+ chain_index=chain_index,
289
+ b_factors=b_factors)
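A minimal round-trip sketch for the `Protein` container above. It assumes `analysis/src` is importable and that `input.pdb` is a placeholder for a single-model PDB file.

from src.common import protein

with open("input.pdb") as fh:                      # hypothetical single-model PDB
    prot = protein.from_pdb_string(fh.read())

print(prot.atom_positions.shape)                   # (num_res, 37, 3)
print(prot.aatype.shape, prot.atom_mask.shape)     # (num_res,), (num_res, 37)

# Write it back out as MODEL 1 of a new file.
with open("roundtrip.pdb", "w") as fh:
    fh.write(protein.to_pdb(prot, model=1, add_end=True))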
analysis/src/common/residue_constants.py ADDED
@@ -0,0 +1,897 @@
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Constants used in AlphaFold."""
16
+
17
+ import collections
18
+ import functools
19
+ import os
20
+ from typing import List, Mapping, Tuple
21
+
22
+ import numpy as np
23
+ import tree
24
+
25
+ # Internal import (35fd).
26
+
27
+
28
+ # Distance from one CA to next CA [trans configuration: omega = 180].
29
+ ca_ca = 3.80209737096
30
+
31
+ # Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
32
+ # this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
33
+ # chi angles so their chi angle lists are empty.
34
+ chi_angles_atoms = {
35
+ 'ALA': [],
36
+ # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
37
+ 'ARG': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
38
+ ['CB', 'CG', 'CD', 'NE'], ['CG', 'CD', 'NE', 'CZ']],
39
+ 'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
40
+ 'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
41
+ 'CYS': [['N', 'CA', 'CB', 'SG']],
42
+ 'GLN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
43
+ ['CB', 'CG', 'CD', 'OE1']],
44
+ 'GLU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
45
+ ['CB', 'CG', 'CD', 'OE1']],
46
+ 'GLY': [],
47
+ 'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']],
48
+ 'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']],
49
+ 'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
50
+ 'LYS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
51
+ ['CB', 'CG', 'CD', 'CE'], ['CG', 'CD', 'CE', 'NZ']],
52
+ 'MET': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'SD'],
53
+ ['CB', 'CG', 'SD', 'CE']],
54
+ 'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
55
+ 'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']],
56
+ 'SER': [['N', 'CA', 'CB', 'OG']],
57
+ 'THR': [['N', 'CA', 'CB', 'OG1']],
58
+ 'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
59
+ 'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
60
+ 'VAL': [['N', 'CA', 'CB', 'CG1']],
61
+ }
62
+
63
+ # If chi angles given in fixed-length array, this matrix determines how to mask
64
+ # them for each AA type. The order is as per restype_order (see below).
65
+ chi_angles_mask = [
66
+ [0.0, 0.0, 0.0, 0.0], # ALA
67
+ [1.0, 1.0, 1.0, 1.0], # ARG
68
+ [1.0, 1.0, 0.0, 0.0], # ASN
69
+ [1.0, 1.0, 0.0, 0.0], # ASP
70
+ [1.0, 0.0, 0.0, 0.0], # CYS
71
+ [1.0, 1.0, 1.0, 0.0], # GLN
72
+ [1.0, 1.0, 1.0, 0.0], # GLU
73
+ [0.0, 0.0, 0.0, 0.0], # GLY
74
+ [1.0, 1.0, 0.0, 0.0], # HIS
75
+ [1.0, 1.0, 0.0, 0.0], # ILE
76
+ [1.0, 1.0, 0.0, 0.0], # LEU
77
+ [1.0, 1.0, 1.0, 1.0], # LYS
78
+ [1.0, 1.0, 1.0, 0.0], # MET
79
+ [1.0, 1.0, 0.0, 0.0], # PHE
80
+ [1.0, 1.0, 0.0, 0.0], # PRO
81
+ [1.0, 0.0, 0.0, 0.0], # SER
82
+ [1.0, 0.0, 0.0, 0.0], # THR
83
+ [1.0, 1.0, 0.0, 0.0], # TRP
84
+ [1.0, 1.0, 0.0, 0.0], # TYR
85
+ [1.0, 0.0, 0.0, 0.0], # VAL
86
+ ]
87
+
88
+ # The following chi angles are pi periodic: they can be rotated by a multiple
89
+ # of pi without affecting the structure.
90
+ chi_pi_periodic = [
91
+ [0.0, 0.0, 0.0, 0.0], # ALA
92
+ [0.0, 0.0, 0.0, 0.0], # ARG
93
+ [0.0, 0.0, 0.0, 0.0], # ASN
94
+ [0.0, 1.0, 0.0, 0.0], # ASP
95
+ [0.0, 0.0, 0.0, 0.0], # CYS
96
+ [0.0, 0.0, 0.0, 0.0], # GLN
97
+ [0.0, 0.0, 1.0, 0.0], # GLU
98
+ [0.0, 0.0, 0.0, 0.0], # GLY
99
+ [0.0, 0.0, 0.0, 0.0], # HIS
100
+ [0.0, 0.0, 0.0, 0.0], # ILE
101
+ [0.0, 0.0, 0.0, 0.0], # LEU
102
+ [0.0, 0.0, 0.0, 0.0], # LYS
103
+ [0.0, 0.0, 0.0, 0.0], # MET
104
+ [0.0, 1.0, 0.0, 0.0], # PHE
105
+ [0.0, 0.0, 0.0, 0.0], # PRO
106
+ [0.0, 0.0, 0.0, 0.0], # SER
107
+ [0.0, 0.0, 0.0, 0.0], # THR
108
+ [0.0, 0.0, 0.0, 0.0], # TRP
109
+ [0.0, 1.0, 0.0, 0.0], # TYR
110
+ [0.0, 0.0, 0.0, 0.0], # VAL
111
+ [0.0, 0.0, 0.0, 0.0], # UNK
112
+ ]
113
+
114
+ # Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi,
115
+ # psi and chi angles:
116
+ # 0: 'backbone group',
117
+ # 1: 'pre-omega-group', (empty)
118
+ # 2: 'phi-group', (currently empty, because it defines only hydrogens)
119
+ # 3: 'psi-group',
120
+ # 4,5,6,7: 'chi1,2,3,4-group'
121
+ # The atom positions are relative to the axis-end-atom of the corresponding
122
+ # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
123
+ # is defined such that the dihedral-angle-defining atom (the last entry in
124
+ # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
125
+ # format: [atomname, group_idx, rel_position]
126
+ rigid_group_atom_positions = {
127
+ 'ALA': [
128
+ ['N', 0, (-0.525, 1.363, 0.000)],
129
+ ['CA', 0, (0.000, 0.000, 0.000)],
130
+ ['C', 0, (1.526, -0.000, -0.000)],
131
+ ['CB', 0, (-0.529, -0.774, -1.205)],
132
+ ['O', 3, (0.627, 1.062, 0.000)],
133
+ ],
134
+ 'ARG': [
135
+ ['N', 0, (-0.524, 1.362, -0.000)],
136
+ ['CA', 0, (0.000, 0.000, 0.000)],
137
+ ['C', 0, (1.525, -0.000, -0.000)],
138
+ ['CB', 0, (-0.524, -0.778, -1.209)],
139
+ ['O', 3, (0.626, 1.062, 0.000)],
140
+ ['CG', 4, (0.616, 1.390, -0.000)],
141
+ ['CD', 5, (0.564, 1.414, 0.000)],
142
+ ['NE', 6, (0.539, 1.357, -0.000)],
143
+ ['NH1', 7, (0.206, 2.301, 0.000)],
144
+ ['NH2', 7, (2.078, 0.978, -0.000)],
145
+ ['CZ', 7, (0.758, 1.093, -0.000)],
146
+ ],
147
+ 'ASN': [
148
+ ['N', 0, (-0.536, 1.357, 0.000)],
149
+ ['CA', 0, (0.000, 0.000, 0.000)],
150
+ ['C', 0, (1.526, -0.000, -0.000)],
151
+ ['CB', 0, (-0.531, -0.787, -1.200)],
152
+ ['O', 3, (0.625, 1.062, 0.000)],
153
+ ['CG', 4, (0.584, 1.399, 0.000)],
154
+ ['ND2', 5, (0.593, -1.188, 0.001)],
155
+ ['OD1', 5, (0.633, 1.059, 0.000)],
156
+ ],
157
+ 'ASP': [
158
+ ['N', 0, (-0.525, 1.362, -0.000)],
159
+ ['CA', 0, (0.000, 0.000, 0.000)],
160
+ ['C', 0, (1.527, 0.000, -0.000)],
161
+ ['CB', 0, (-0.526, -0.778, -1.208)],
162
+ ['O', 3, (0.626, 1.062, -0.000)],
163
+ ['CG', 4, (0.593, 1.398, -0.000)],
164
+ ['OD1', 5, (0.610, 1.091, 0.000)],
165
+ ['OD2', 5, (0.592, -1.101, -0.003)],
166
+ ],
167
+ 'CYS': [
168
+ ['N', 0, (-0.522, 1.362, -0.000)],
169
+ ['CA', 0, (0.000, 0.000, 0.000)],
170
+ ['C', 0, (1.524, 0.000, 0.000)],
171
+ ['CB', 0, (-0.519, -0.773, -1.212)],
172
+ ['O', 3, (0.625, 1.062, -0.000)],
173
+ ['SG', 4, (0.728, 1.653, 0.000)],
174
+ ],
175
+ 'GLN': [
176
+ ['N', 0, (-0.526, 1.361, -0.000)],
177
+ ['CA', 0, (0.000, 0.000, 0.000)],
178
+ ['C', 0, (1.526, 0.000, 0.000)],
179
+ ['CB', 0, (-0.525, -0.779, -1.207)],
180
+ ['O', 3, (0.626, 1.062, -0.000)],
181
+ ['CG', 4, (0.615, 1.393, 0.000)],
182
+ ['CD', 5, (0.587, 1.399, -0.000)],
183
+ ['NE2', 6, (0.593, -1.189, -0.001)],
184
+ ['OE1', 6, (0.634, 1.060, 0.000)],
185
+ ],
186
+ 'GLU': [
187
+ ['N', 0, (-0.528, 1.361, 0.000)],
188
+ ['CA', 0, (0.000, 0.000, 0.000)],
189
+ ['C', 0, (1.526, -0.000, -0.000)],
190
+ ['CB', 0, (-0.526, -0.781, -1.207)],
191
+ ['O', 3, (0.626, 1.062, 0.000)],
192
+ ['CG', 4, (0.615, 1.392, 0.000)],
193
+ ['CD', 5, (0.600, 1.397, 0.000)],
194
+ ['OE1', 6, (0.607, 1.095, -0.000)],
195
+ ['OE2', 6, (0.589, -1.104, -0.001)],
196
+ ],
197
+ 'GLY': [
198
+ ['N', 0, (-0.572, 1.337, 0.000)],
199
+ ['CA', 0, (0.000, 0.000, 0.000)],
200
+ ['C', 0, (1.517, -0.000, -0.000)],
201
+ ['O', 3, (0.626, 1.062, -0.000)],
202
+ ],
203
+ 'HIS': [
204
+ ['N', 0, (-0.527, 1.360, 0.000)],
205
+ ['CA', 0, (0.000, 0.000, 0.000)],
206
+ ['C', 0, (1.525, 0.000, 0.000)],
207
+ ['CB', 0, (-0.525, -0.778, -1.208)],
208
+ ['O', 3, (0.625, 1.063, 0.000)],
209
+ ['CG', 4, (0.600, 1.370, -0.000)],
210
+ ['CD2', 5, (0.889, -1.021, 0.003)],
211
+ ['ND1', 5, (0.744, 1.160, -0.000)],
212
+ ['CE1', 5, (2.030, 0.851, 0.002)],
213
+ ['NE2', 5, (2.145, -0.466, 0.004)],
214
+ ],
215
+ 'ILE': [
216
+ ['N', 0, (-0.493, 1.373, -0.000)],
217
+ ['CA', 0, (0.000, 0.000, 0.000)],
218
+ ['C', 0, (1.527, -0.000, -0.000)],
219
+ ['CB', 0, (-0.536, -0.793, -1.213)],
220
+ ['O', 3, (0.627, 1.062, -0.000)],
221
+ ['CG1', 4, (0.534, 1.437, -0.000)],
222
+ ['CG2', 4, (0.540, -0.785, -1.199)],
223
+ ['CD1', 5, (0.619, 1.391, 0.000)],
224
+ ],
225
+ 'LEU': [
226
+ ['N', 0, (-0.520, 1.363, 0.000)],
227
+ ['CA', 0, (0.000, 0.000, 0.000)],
228
+ ['C', 0, (1.525, -0.000, -0.000)],
229
+ ['CB', 0, (-0.522, -0.773, -1.214)],
230
+ ['O', 3, (0.625, 1.063, -0.000)],
231
+ ['CG', 4, (0.678, 1.371, 0.000)],
232
+ ['CD1', 5, (0.530, 1.430, -0.000)],
233
+ ['CD2', 5, (0.535, -0.774, 1.200)],
234
+ ],
235
+ 'LYS': [
236
+ ['N', 0, (-0.526, 1.362, -0.000)],
237
+ ['CA', 0, (0.000, 0.000, 0.000)],
238
+ ['C', 0, (1.526, 0.000, 0.000)],
239
+ ['CB', 0, (-0.524, -0.778, -1.208)],
240
+ ['O', 3, (0.626, 1.062, -0.000)],
241
+ ['CG', 4, (0.619, 1.390, 0.000)],
242
+ ['CD', 5, (0.559, 1.417, 0.000)],
243
+ ['CE', 6, (0.560, 1.416, 0.000)],
244
+ ['NZ', 7, (0.554, 1.387, 0.000)],
245
+ ],
246
+ 'MET': [
247
+ ['N', 0, (-0.521, 1.364, -0.000)],
248
+ ['CA', 0, (0.000, 0.000, 0.000)],
249
+ ['C', 0, (1.525, 0.000, 0.000)],
250
+ ['CB', 0, (-0.523, -0.776, -1.210)],
251
+ ['O', 3, (0.625, 1.062, -0.000)],
252
+ ['CG', 4, (0.613, 1.391, -0.000)],
253
+ ['SD', 5, (0.703, 1.695, 0.000)],
254
+ ['CE', 6, (0.320, 1.786, -0.000)],
255
+ ],
256
+ 'PHE': [
257
+ ['N', 0, (-0.518, 1.363, 0.000)],
258
+ ['CA', 0, (0.000, 0.000, 0.000)],
259
+ ['C', 0, (1.524, 0.000, -0.000)],
260
+ ['CB', 0, (-0.525, -0.776, -1.212)],
261
+ ['O', 3, (0.626, 1.062, -0.000)],
262
+ ['CG', 4, (0.607, 1.377, 0.000)],
263
+ ['CD1', 5, (0.709, 1.195, -0.000)],
264
+ ['CD2', 5, (0.706, -1.196, 0.000)],
265
+ ['CE1', 5, (2.102, 1.198, -0.000)],
266
+ ['CE2', 5, (2.098, -1.201, -0.000)],
267
+ ['CZ', 5, (2.794, -0.003, -0.001)],
268
+ ],
269
+ 'PRO': [
270
+ ['N', 0, (-0.566, 1.351, -0.000)],
271
+ ['CA', 0, (0.000, 0.000, 0.000)],
272
+ ['C', 0, (1.527, -0.000, 0.000)],
273
+ ['CB', 0, (-0.546, -0.611, -1.293)],
274
+ ['O', 3, (0.621, 1.066, 0.000)],
275
+ ['CG', 4, (0.382, 1.445, 0.0)],
276
+ # ['CD', 5, (0.427, 1.440, 0.0)],
277
+ ['CD', 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger
278
+ ],
279
+ 'SER': [
280
+ ['N', 0, (-0.529, 1.360, -0.000)],
281
+ ['CA', 0, (0.000, 0.000, 0.000)],
282
+ ['C', 0, (1.525, -0.000, -0.000)],
283
+ ['CB', 0, (-0.518, -0.777, -1.211)],
284
+ ['O', 3, (0.626, 1.062, -0.000)],
285
+ ['OG', 4, (0.503, 1.325, 0.000)],
286
+ ],
287
+ 'THR': [
288
+ ['N', 0, (-0.517, 1.364, 0.000)],
289
+ ['CA', 0, (0.000, 0.000, 0.000)],
290
+ ['C', 0, (1.526, 0.000, -0.000)],
291
+ ['CB', 0, (-0.516, -0.793, -1.215)],
292
+ ['O', 3, (0.626, 1.062, 0.000)],
293
+ ['CG2', 4, (0.550, -0.718, -1.228)],
294
+ ['OG1', 4, (0.472, 1.353, 0.000)],
295
+ ],
296
+ 'TRP': [
297
+ ['N', 0, (-0.521, 1.363, 0.000)],
298
+ ['CA', 0, (0.000, 0.000, 0.000)],
299
+ ['C', 0, (1.525, -0.000, 0.000)],
300
+ ['CB', 0, (-0.523, -0.776, -1.212)],
301
+ ['O', 3, (0.627, 1.062, 0.000)],
302
+ ['CG', 4, (0.609, 1.370, -0.000)],
303
+ ['CD1', 5, (0.824, 1.091, 0.000)],
304
+ ['CD2', 5, (0.854, -1.148, -0.005)],
305
+ ['CE2', 5, (2.186, -0.678, -0.007)],
306
+ ['CE3', 5, (0.622, -2.530, -0.007)],
307
+ ['NE1', 5, (2.140, 0.690, -0.004)],
308
+ ['CH2', 5, (3.028, -2.890, -0.013)],
309
+ ['CZ2', 5, (3.283, -1.543, -0.011)],
310
+ ['CZ3', 5, (1.715, -3.389, -0.011)],
311
+ ],
312
+ 'TYR': [
313
+ ['N', 0, (-0.522, 1.362, 0.000)],
314
+ ['CA', 0, (0.000, 0.000, 0.000)],
315
+ ['C', 0, (1.524, -0.000, -0.000)],
316
+ ['CB', 0, (-0.522, -0.776, -1.213)],
317
+ ['O', 3, (0.627, 1.062, -0.000)],
318
+ ['CG', 4, (0.607, 1.382, -0.000)],
319
+ ['CD1', 5, (0.716, 1.195, -0.000)],
320
+ ['CD2', 5, (0.713, -1.194, -0.001)],
321
+ ['CE1', 5, (2.107, 1.200, -0.002)],
322
+ ['CE2', 5, (2.104, -1.201, -0.003)],
323
+ ['OH', 5, (4.168, -0.002, -0.005)],
324
+ ['CZ', 5, (2.791, -0.001, -0.003)],
325
+ ],
326
+ 'VAL': [
327
+ ['N', 0, (-0.494, 1.373, -0.000)],
328
+ ['CA', 0, (0.000, 0.000, 0.000)],
329
+ ['C', 0, (1.527, -0.000, -0.000)],
330
+ ['CB', 0, (-0.533, -0.795, -1.213)],
331
+ ['O', 3, (0.627, 1.062, -0.000)],
332
+ ['CG1', 4, (0.540, 1.429, -0.000)],
333
+ ['CG2', 4, (0.533, -0.776, 1.203)],
334
+ ],
335
+ }
336
+
337
+ # A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
338
+ residue_atoms = {
339
+ 'ALA': ['C', 'CA', 'CB', 'N', 'O'],
340
+ 'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'],
341
+ 'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'],
342
+ 'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'],
343
+ 'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'],
344
+ 'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'],
345
+ 'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'],
346
+ 'GLY': ['C', 'CA', 'N', 'O'],
347
+ 'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'],
348
+ 'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'],
349
+ 'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'],
350
+ 'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'],
351
+ 'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'],
352
+ 'PHE': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'],
353
+ 'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'],
354
+ 'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'],
355
+ 'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'],
356
+ 'TRP': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'CZ2', 'CZ3',
357
+ 'CH2', 'N', 'NE1', 'O'],
358
+ 'TYR': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O',
359
+ 'OH'],
360
+ 'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O']
361
+ }
362
+
363
+ # Naming swaps for ambiguous atom names.
364
+ # Due to symmetries in the amino acids the naming of atoms is ambiguous in
365
+ # 4 of the 20 amino acids.
366
+ # (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
367
+ # in LEU, VAL and ARG can be resolved by using the 3d constellations of
368
+ # the 'ambiguous' atoms and their neighbours)
369
+ residue_atom_renaming_swaps = {
370
+ 'ASP': {'OD1': 'OD2'},
371
+ 'GLU': {'OE1': 'OE2'},
372
+ 'PHE': {'CD1': 'CD2', 'CE1': 'CE2'},
373
+ 'TYR': {'CD1': 'CD2', 'CE1': 'CE2'},
374
+ }
375
+
376
+ # Van der Waals radii [Angstroem] of the atoms (from Wikipedia)
377
+ van_der_waals_radius = {
378
+ 'C': 1.7,
379
+ 'N': 1.55,
380
+ 'O': 1.52,
381
+ 'S': 1.8,
382
+ }
383
+
384
+ Bond = collections.namedtuple(
385
+ 'Bond', ['atom1_name', 'atom2_name', 'length', 'stddev'])
386
+ BondAngle = collections.namedtuple(
387
+ 'BondAngle',
388
+ ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev'])
389
+
390
+
391
+ @functools.lru_cache(maxsize=None)
392
+ def load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]],
393
+ Mapping[str, List[Bond]],
394
+ Mapping[str, List[BondAngle]]]:
395
+ """Load stereo_chemical_props.txt into a nice structure.
396
+
397
+ Load literature values for bond lengths and bond angles and translate
398
+ bond angles into the length of the opposite edge of the triangle
399
+ ("residue_virtual_bonds").
400
+
401
+ Returns:
402
+ residue_bonds: Dict that maps resname -> list of Bond tuples.
403
+ residue_virtual_bonds: Dict that maps resname -> list of Bond tuples.
404
+ residue_bond_angles: Dict that maps resname -> list of BondAngle tuples.
405
+ """
406
+ stereo_chemical_props_path = os.path.join(
407
+ os.path.dirname(os.path.abspath(__file__)), 'stereo_chemical_props.txt'
408
+ )
409
+ with open(stereo_chemical_props_path, 'rt') as f:
410
+ stereo_chemical_props = f.read()
411
+ lines_iter = iter(stereo_chemical_props.splitlines())
412
+ # Load bond lengths.
413
+ residue_bonds = {}
414
+ next(lines_iter) # Skip header line.
415
+ for line in lines_iter:
416
+ if line.strip() == '-':
417
+ break
418
+ bond, resname, length, stddev = line.split()
419
+ atom1, atom2 = bond.split('-')
420
+ if resname not in residue_bonds:
421
+ residue_bonds[resname] = []
422
+ residue_bonds[resname].append(
423
+ Bond(atom1, atom2, float(length), float(stddev)))
424
+ residue_bonds['UNK'] = []
425
+
426
+ # Load bond angles.
427
+ residue_bond_angles = {}
428
+ next(lines_iter) # Skip empty line.
429
+ next(lines_iter) # Skip header line.
430
+ for line in lines_iter:
431
+ if line.strip() == '-':
432
+ break
433
+ bond, resname, angle_degree, stddev_degree = line.split()
434
+ atom1, atom2, atom3 = bond.split('-')
435
+ if resname not in residue_bond_angles:
436
+ residue_bond_angles[resname] = []
437
+ residue_bond_angles[resname].append(
438
+ BondAngle(atom1, atom2, atom3,
439
+ float(angle_degree) / 180. * np.pi,
440
+ float(stddev_degree) / 180. * np.pi))
441
+ residue_bond_angles['UNK'] = []
442
+
443
+ def make_bond_key(atom1_name, atom2_name):
444
+ """Unique key to lookup bonds."""
445
+ return '-'.join(sorted([atom1_name, atom2_name]))
446
+
447
+ # Translate bond angles into distances ("virtual bonds").
448
+ residue_virtual_bonds = {}
449
+ for resname, bond_angles in residue_bond_angles.items():
450
+ # Create a fast lookup dict for bond lengths.
451
+ bond_cache = {}
452
+ for b in residue_bonds[resname]:
453
+ bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b
454
+ residue_virtual_bonds[resname] = []
455
+ for ba in bond_angles:
456
+ bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]
457
+ bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]
458
+
459
+ # Compute distance between atom1 and atom3 using the law of cosines
460
+ # c^2 = a^2 + b^2 - 2ab*cos(gamma).
461
+ gamma = ba.angle_rad
462
+ length = np.sqrt(bond1.length**2 + bond2.length**2
463
+ - 2 * bond1.length * bond2.length * np.cos(gamma))
464
+
465
+ # Propagation of uncertainty assuming uncorrelated errors.
466
+ dl_outer = 0.5 / length
467
+ dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer
468
+ dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer
469
+ dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer
470
+ stddev = np.sqrt((dl_dgamma * ba.stddev)**2 +
471
+ (dl_db1 * bond1.stddev)**2 +
472
+ (dl_db2 * bond2.stddev)**2)
473
+ residue_virtual_bonds[resname].append(
474
+ Bond(ba.atom1_name, ba.atom3name, length, stddev))
475
+
476
+ return (residue_bonds,
477
+ residue_virtual_bonds,
478
+ residue_bond_angles)
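As a side note, the law-of-cosines step above can be checked in isolation; the sketch below uses assumed bond lengths and an assumed angle rather than values parsed from stereo_chemical_props.txt, and is not part of the diff.

import numpy as np  # numpy is already imported at the top of this module

b1, b2 = 1.459, 1.525          # assumed N-CA and CA-C bond lengths (Angstrom)
gamma = 111.0 / 180.0 * np.pi  # assumed N-CA-C bond angle in radians
virtual = np.sqrt(b1**2 + b2**2 - 2 * b1 * b2 * np.cos(gamma))
print(round(float(virtual), 3))  # ~2.46, the N...C "virtual bond" length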
479
+
480
+
481
+ # Between-residue bond lengths for general bonds (first element) and for Proline
482
+ # (second element).
483
+ between_res_bond_length_c_n = [1.329, 1.341]
484
+ between_res_bond_length_stddev_c_n = [0.014, 0.016]
485
+
486
+ # Between-residue cos_angles.
487
+ between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315
488
+ between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995
489
+
490
+ # This mapping is used when we need to store atom data in a format that requires
491
+ # fixed atom data size for every residue (e.g. a numpy array).
492
+ atom_types = [
493
+ 'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',
494
+ 'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',
495
+ 'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',
496
+ 'CZ3', 'NZ', 'OXT'
497
+ ]
498
+ atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
499
+ atom_type_num = len(atom_types) # := 37.
500
+
501
+ # A compact atom encoding with 14 columns
502
+ # pylint: disable=line-too-long
503
+ # pylint: disable=bad-whitespace
504
+ restype_name_to_atom14_names = {
505
+ 'ALA': ['N', 'CA', 'C', 'O', 'CB', '', '', '', '', '', '', '', '', ''],
506
+ 'ARG': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2', '', '', ''],
507
+ 'ASN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2', '', '', '', '', '', ''],
508
+ 'ASP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2', '', '', '', '', '', ''],
509
+ 'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG', '', '', '', '', '', '', '', ''],
510
+ 'GLN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2', '', '', '', '', ''],
511
+ 'GLU': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2', '', '', '', '', ''],
512
+ 'GLY': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''],
513
+ 'HIS': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2', '', '', '', ''],
514
+ 'ILE': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '', '', '', '', '', ''],
515
+ 'LEU': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', '', '', '', '', '', ''],
516
+ 'LYS': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', '', '', '', '', ''],
517
+ 'MET': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE', '', '', '', '', '', ''],
518
+ 'PHE': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', '', '', ''],
519
+ 'PRO': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', '', '', '', '', '', '', ''],
520
+ 'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG', '', '', '', '', '', '', '', ''],
521
+ 'THR': ['N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '', '', '', '', '', '', ''],
522
+ 'TRP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2'],
523
+ 'TYR': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH', '', ''],
524
+ 'VAL': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', ''],
525
+ 'UNK': ['', '', '', '', '', '', '', '', '', '', '', '', '', ''],
526
+
527
+ }
528
+ # pylint: enable=line-too-long
529
+ # pylint: enable=bad-whitespace
530
+
531
+
532
+ # This is the standard residue order when coding AA type as a number.
533
+ # Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
534
+ restypes = [
535
+ 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P',
536
+ 'S', 'T', 'W', 'Y', 'V'
537
+ ]
538
+ restype_order = {restype: i for i, restype in enumerate(restypes)}
539
+ restype_num = len(restypes) # := 20.
540
+ unk_restype_index = restype_num # Catch-all index for unknown restypes.
541
+
542
+ restypes_with_x = restypes + ['X']
543
+ restype_order_with_x = {restype: i for i, restype in enumerate(restypes_with_x)}
544
+
545
+
546
+ def sequence_to_onehot(
547
+ sequence: str,
548
+ mapping: Mapping[str, int],
549
+ map_unknown_to_x: bool = False) -> np.ndarray:
550
+ """Maps the given sequence into a one-hot encoded matrix.
551
+
552
+ Args:
553
+ sequence: An amino acid sequence.
554
+ mapping: A dictionary mapping amino acids to integers.
555
+ map_unknown_to_x: If True, any amino acid that is not in the mapping will be
556
+ mapped to the unknown amino acid 'X'. If the mapping doesn't contain
557
+ amino acid 'X', an error will be thrown. If False, any amino acid not in
558
+ the mapping will throw an error.
559
+
560
+ Returns:
561
+ A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of
562
+ the sequence.
563
+
564
+ Raises:
565
+ ValueError: If the mapping doesn't contain values from 0 to
566
+ num_unique_aas - 1 without any gaps.
567
+ """
568
+ num_entries = max(mapping.values()) + 1
569
+
570
+ if sorted(set(mapping.values())) != list(range(num_entries)):
571
+ raise ValueError('The mapping must have values from 0 to num_unique_aas-1 '
572
+ 'without any gaps. Got: %s' % sorted(mapping.values()))
573
+
574
+ one_hot_arr = np.zeros((len(sequence), num_entries), dtype=int)
575
+
576
+ for aa_index, aa_type in enumerate(sequence):
577
+ if map_unknown_to_x:
578
+ if aa_type.isalpha() and aa_type.isupper():
579
+ aa_id = mapping.get(aa_type, mapping['X'])
580
+ else:
581
+ raise ValueError(f'Invalid character in the sequence: {aa_type}')
582
+ else:
583
+ aa_id = mapping[aa_type]
584
+ one_hot_arr[aa_index, aa_id] = 1
585
+
586
+ return one_hot_arr
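A minimal usage sketch (not part of the diff), relying on the restype_order_with_x mapping defined above:

onehot = sequence_to_onehot('ACDZ', restype_order_with_x, map_unknown_to_x=True)
print(onehot.shape)             # (4, 21)
print(int(onehot[3].argmax()))  # 20 -> 'Z' is not in the mapping, so it maps to 'X'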
587
+
588
+
589
+ restype_1to3 = {
590
+ 'A': 'ALA',
591
+ 'R': 'ARG',
592
+ 'N': 'ASN',
593
+ 'D': 'ASP',
594
+ 'C': 'CYS',
595
+ 'Q': 'GLN',
596
+ 'E': 'GLU',
597
+ 'G': 'GLY',
598
+ 'H': 'HIS',
599
+ 'I': 'ILE',
600
+ 'L': 'LEU',
601
+ 'K': 'LYS',
602
+ 'M': 'MET',
603
+ 'F': 'PHE',
604
+ 'P': 'PRO',
605
+ 'S': 'SER',
606
+ 'T': 'THR',
607
+ 'W': 'TRP',
608
+ 'Y': 'TYR',
609
+ 'V': 'VAL',
610
+ }
611
+
612
+
613
+ # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
614
+ # 1-to-1 mapping of 3 letter names to one letter names. The latter contains
615
+ # many more, and less common, three letter names as keys and maps many of these
616
+ # to the same one letter name (including 'X' and 'U' which we don't use here).
617
+ restype_3to1 = {v: k for k, v in restype_1to3.items()}
618
+
619
+ # Define a restype name for all unknown residues.
620
+ unk_restype = 'UNK'
621
+
622
+ resnames = [restype_1to3[r] for r in restypes] + [unk_restype]
623
+ resname_to_idx = {resname: i for i, resname in enumerate(resnames)}
624
+
625
+
626
+ # The mapping here uses hhblits convention, so that B is mapped to D, J and O
627
+ # are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the
628
+ # remaining 20 amino acids are kept in alphabetical order.
629
+ # There are 2 non-amino acid codes, X (representing any amino acid) and
630
+ # "-" representing a missing amino acid in an alignment. The id for these
631
+ # codes is put at the end (20 and 21) so that they can easily be ignored if
632
+ # desired.
633
+ HHBLITS_AA_TO_ID = {
634
+ 'A': 0,
635
+ 'B': 2,
636
+ 'C': 1,
637
+ 'D': 2,
638
+ 'E': 3,
639
+ 'F': 4,
640
+ 'G': 5,
641
+ 'H': 6,
642
+ 'I': 7,
643
+ 'J': 20,
644
+ 'K': 8,
645
+ 'L': 9,
646
+ 'M': 10,
647
+ 'N': 11,
648
+ 'O': 20,
649
+ 'P': 12,
650
+ 'Q': 13,
651
+ 'R': 14,
652
+ 'S': 15,
653
+ 'T': 16,
654
+ 'U': 1,
655
+ 'V': 17,
656
+ 'W': 18,
657
+ 'X': 20,
658
+ 'Y': 19,
659
+ 'Z': 3,
660
+ '-': 21,
661
+ }
662
+
663
+ # Partial inversion of HHBLITS_AA_TO_ID.
664
+ ID_TO_HHBLITS_AA = {
665
+ 0: 'A',
666
+ 1: 'C', # Also U.
667
+ 2: 'D', # Also B.
668
+ 3: 'E', # Also Z.
669
+ 4: 'F',
670
+ 5: 'G',
671
+ 6: 'H',
672
+ 7: 'I',
673
+ 8: 'K',
674
+ 9: 'L',
675
+ 10: 'M',
676
+ 11: 'N',
677
+ 12: 'P',
678
+ 13: 'Q',
679
+ 14: 'R',
680
+ 15: 'S',
681
+ 16: 'T',
682
+ 17: 'V',
683
+ 18: 'W',
684
+ 19: 'Y',
685
+ 20: 'X', # Includes J and O.
686
+ 21: '-',
687
+ }
688
+
689
+ restypes_with_x_and_gap = restypes + ['X', '-']
690
+ MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple(
691
+ restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i])
692
+ for i in range(len(restypes_with_x_and_gap)))
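A brief sketch (not part of the diff) of how the lookup table above remaps an hhblits-encoded sequence to this file's alphabetical encoding:

hhblits_ids = np.array([HHBLITS_AA_TO_ID[a] for a in 'ACD'])     # [0, 1, 2]
our_ids = np.take(MAP_HHBLITS_AATYPE_TO_OUR_AATYPE, hhblits_ids)
print(our_ids)  # [0 4 3] -> positions of A, C, D in the alphabetical restypes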
693
+
694
+
695
+ def _make_standard_atom_mask() -> np.ndarray:
696
+ """Returns [num_res_types, num_atom_types] mask array."""
697
+ # +1 to account for unknown (all 0s).
698
+ mask = np.zeros([restype_num + 1, atom_type_num], dtype=int)
699
+ for restype, restype_letter in enumerate(restypes):
700
+ restype_name = restype_1to3[restype_letter]
701
+ atom_names = residue_atoms[restype_name]
702
+ for atom_name in atom_names:
703
+ atom_type = atom_order[atom_name]
704
+ mask[restype, atom_type] = 1
705
+ return mask
706
+
707
+
708
+ STANDARD_ATOM_MASK = _make_standard_atom_mask()
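As a quick illustration (not part of the diff), glycine's row of STANDARD_ATOM_MASK flags only the four backbone atoms:

gly_row = STANDARD_ATOM_MASK[restype_order['G']]
print(int(gly_row.sum()))                             # 4
print([atom_types[i] for i in np.where(gly_row)[0]])  # ['N', 'CA', 'C', 'O']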
709
+
710
+
711
+ # A one hot representation for the first and second atoms defining the axis
712
+ # of rotation for each chi-angle in each residue.
713
+ def chi_angle_atom(atom_index: int) -> np.ndarray:
714
+ """Define chi-angle rigid groups via one-hot representations."""
715
+ chi_angles_index = {}
716
+ one_hots = []
717
+
718
+ for k, v in chi_angles_atoms.items():
719
+ indices = [atom_types.index(s[atom_index]) for s in v]
720
+ indices.extend([-1]*(4-len(indices)))
721
+ chi_angles_index[k] = indices
722
+
723
+ for r in restypes:
724
+ res3 = restype_1to3[r]
725
+ one_hot = np.eye(atom_type_num)[chi_angles_index[res3]]
726
+ one_hots.append(one_hot)
727
+
728
+ one_hots.append(np.zeros([4, atom_type_num])) # Add zeros for residue `X`.
729
+ one_hot = np.stack(one_hots, axis=0)
730
+ one_hot = np.transpose(one_hot, [0, 2, 1])
731
+
732
+ return one_hot
733
+
734
+ chi_atom_1_one_hot = chi_angle_atom(1)
735
+ chi_atom_2_one_hot = chi_angle_atom(2)
736
+
737
+ # An array like chi_angles_atoms but using indices rather than names.
738
+ chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes]
739
+ chi_angles_atom_indices = tree.map_structure(
740
+ lambda atom_name: atom_order[atom_name], chi_angles_atom_indices)
741
+ chi_angles_atom_indices = np.array([
742
+ chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms)))
743
+ for chi_atoms in chi_angles_atom_indices])
744
+
745
+ # Mapping from (res_name, atom_name) pairs to the atom's chi group index
746
+ # and atom index within that group.
747
+ chi_groups_for_atom = collections.defaultdict(list)
748
+ for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():
749
+ for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):
750
+ for atom_i, atom in enumerate(chi_group):
751
+ chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i))
752
+ chi_groups_for_atom = dict(chi_groups_for_atom)
753
+
754
+
755
+ def _make_rigid_transformation_4x4(ex, ey, translation):
756
+ """Create a rigid 4x4 transformation matrix from two axes and transl."""
757
+ # Normalize ex.
758
+ ex_normalized = ex / np.linalg.norm(ex)
759
+
760
+ # make ey perpendicular to ex
761
+ ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized
762
+ ey_normalized /= np.linalg.norm(ey_normalized)
763
+
764
+ # compute ez as cross product
765
+ eznorm = np.cross(ex_normalized, ey_normalized)
766
+ m = np.stack([ex_normalized, ey_normalized, eznorm, translation]).transpose()
767
+ m = np.concatenate([m, [[0., 0., 0., 1.]]], axis=0)
768
+ return m
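A small usage sketch (not part of the diff): build a frame whose x-axis points along +y of the parent frame and whose origin sits at (1, 0, 0).

frame = _make_rigid_transformation_4x4(
    ex=np.array([0., 1., 0.]),
    ey=np.array([1., 0., 0.]),
    translation=np.array([1., 0., 0.]))
print(frame.shape)   # (4, 4); the last row is [0, 0, 0, 1]
print(frame[:3, 3])  # [1. 0. 0.] -- the translation column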
769
+
770
+
771
+ # create an array with (restype, atomtype) --> rigid_group_idx
772
+ # and an array with (restype, atomtype, coord) for the atom positions
773
+ # and compute affine transformation matrices (4,4) from one rigid group to the
774
+ # previous group
775
+ restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int)
776
+ restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
777
+ restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
778
+ restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int)
779
+ restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
780
+ restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
781
+ restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
782
+
783
+
784
+ def _make_rigid_group_constants():
785
+ """Fill the arrays above."""
786
+ for restype, restype_letter in enumerate(restypes):
787
+ resname = restype_1to3[restype_letter]
788
+ for atomname, group_idx, atom_position in rigid_group_atom_positions[
789
+ resname]:
790
+ atomtype = atom_order[atomname]
791
+ restype_atom37_to_rigid_group[restype, atomtype] = group_idx
792
+ restype_atom37_mask[restype, atomtype] = 1
793
+ restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position
794
+
795
+ atom14idx = restype_name_to_atom14_names[resname].index(atomname)
796
+ restype_atom14_to_rigid_group[restype, atom14idx] = group_idx
797
+ restype_atom14_mask[restype, atom14idx] = 1
798
+ restype_atom14_rigid_group_positions[restype,
799
+ atom14idx, :] = atom_position
800
+
801
+ for restype, restype_letter in enumerate(restypes):
802
+ resname = restype_1to3[restype_letter]
803
+ atom_positions = {name: np.array(pos) for name, _, pos
804
+ in rigid_group_atom_positions[resname]}
805
+
806
+ # backbone to backbone is the identity transform
807
+ restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4)
808
+
809
+ # pre-omega-frame to backbone (currently dummy identity matrix)
810
+ restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4)
811
+
812
+ # phi-frame to backbone
813
+ mat = _make_rigid_transformation_4x4(
814
+ ex=atom_positions['N'] - atom_positions['CA'],
815
+ ey=np.array([1., 0., 0.]),
816
+ translation=atom_positions['N'])
817
+ restype_rigid_group_default_frame[restype, 2, :, :] = mat
818
+
819
+ # psi-frame to backbone
820
+ mat = _make_rigid_transformation_4x4(
821
+ ex=atom_positions['C'] - atom_positions['CA'],
822
+ ey=atom_positions['CA'] - atom_positions['N'],
823
+ translation=atom_positions['C'])
824
+ restype_rigid_group_default_frame[restype, 3, :, :] = mat
825
+
826
+ # chi1-frame to backbone
827
+ if chi_angles_mask[restype][0]:
828
+ base_atom_names = chi_angles_atoms[resname][0]
829
+ base_atom_positions = [atom_positions[name] for name in base_atom_names]
830
+ mat = _make_rigid_transformation_4x4(
831
+ ex=base_atom_positions[2] - base_atom_positions[1],
832
+ ey=base_atom_positions[0] - base_atom_positions[1],
833
+ translation=base_atom_positions[2])
834
+ restype_rigid_group_default_frame[restype, 4, :, :] = mat
835
+
836
+ # chi2-frame to chi1-frame
837
+ # chi3-frame to chi2-frame
838
+ # chi4-frame to chi3-frame
839
+ # luckily all rotation axes for the next frame start at (0,0,0) of the
840
+ # previous frame
841
+ for chi_idx in range(1, 4):
842
+ if chi_angles_mask[restype][chi_idx]:
843
+ axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2]
844
+ axis_end_atom_position = atom_positions[axis_end_atom_name]
845
+ mat = _make_rigid_transformation_4x4(
846
+ ex=axis_end_atom_position,
847
+ ey=np.array([-1., 0., 0.]),
848
+ translation=axis_end_atom_position)
849
+ restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat
850
+
851
+
852
+ _make_rigid_group_constants()
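Once the tables are filled, they can be spot-checked; a sketch (not part of the diff):

print(int(restype_atom14_mask[0].sum()))    # 5 -> ALA has N, CA, C, O, CB
print(restype_atom14_to_rigid_group[0, 3])  # 3 -> ALA's O atom belongs to the psi group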
853
+
854
+
855
+ def make_atom14_dists_bounds(overlap_tolerance=1.5,
856
+ bond_length_tolerance_factor=15):
857
+ """compute upper and lower bounds for bonds to assess violations."""
858
+ restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32)
859
+ restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32)
860
+ restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32)
861
+ residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()
862
+ for restype, restype_letter in enumerate(restypes):
863
+ resname = restype_1to3[restype_letter]
864
+ atom_list = restype_name_to_atom14_names[resname]
865
+
866
+ # create lower and upper bounds for clashes
867
+ for atom1_idx, atom1_name in enumerate(atom_list):
868
+ if not atom1_name:
869
+ continue
870
+ atom1_radius = van_der_waals_radius[atom1_name[0]]
871
+ for atom2_idx, atom2_name in enumerate(atom_list):
872
+ if (not atom2_name) or atom1_idx == atom2_idx:
873
+ continue
874
+ atom2_radius = van_der_waals_radius[atom2_name[0]]
875
+ lower = atom1_radius + atom2_radius - overlap_tolerance
876
+ upper = 1e10
877
+ restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower
878
+ restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower
879
+ restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper
880
+ restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper
881
+
882
+ # overwrite lower and upper bounds for bonds and angles
883
+ for b in residue_bonds[resname] + residue_virtual_bonds[resname]:
884
+ atom1_idx = atom_list.index(b.atom1_name)
885
+ atom2_idx = atom_list.index(b.atom2_name)
886
+ lower = b.length - bond_length_tolerance_factor * b.stddev
887
+ upper = b.length + bond_length_tolerance_factor * b.stddev
888
+ restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower
889
+ restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower
890
+ restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper
891
+ restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper
892
+ restype_atom14_bond_stddev[restype, atom1_idx, atom2_idx] = b.stddev
893
+ restype_atom14_bond_stddev[restype, atom2_idx, atom1_idx] = b.stddev
894
+ return {'lower_bound': restype_atom14_bond_lower_bound, # shape (21,14,14)
895
+ 'upper_bound': restype_atom14_bond_upper_bound, # shape (21,14,14)
896
+ 'stddev': restype_atom14_bond_stddev, # shape (21,14,14)
897
+ }
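A usage sketch (not part of the diff); it assumes stereo_chemical_props.txt sits next to this module, as load_stereo_chemical_props expects:

bounds = make_atom14_dists_bounds(overlap_tolerance=1.5,
                                  bond_length_tolerance_factor=12)
print(bounds['lower_bound'].shape)  # (21, 14, 14)
print(bounds['upper_bound'].shape)  # (21, 14, 14)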
analysis/src/common/rigid_utils.py ADDED
@@ -0,0 +1,1451 @@
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Tuple, Any, Sequence, Callable, Optional
17
+
18
+ import numpy as np
19
+ import torch
20
+
21
+ from src.common import rotation3d
22
+
23
+
24
+ def rot_matmul(
25
+ a: torch.Tensor,
26
+ b: torch.Tensor
27
+ ) -> torch.Tensor:
28
+ """
29
+ Performs matrix multiplication of two rotation matrix tensors. Written
30
+ out by hand to avoid AMP downcasting.
31
+
32
+ Args:
33
+ a: [*, 3, 3] left multiplicand
34
+ b: [*, 3, 3] right multiplicand
35
+ Returns:
36
+ The product ab
37
+ """
38
+ row_1 = torch.stack(
39
+ [
40
+ a[..., 0, 0] * b[..., 0, 0]
41
+ + a[..., 0, 1] * b[..., 1, 0]
42
+ + a[..., 0, 2] * b[..., 2, 0],
43
+ a[..., 0, 0] * b[..., 0, 1]
44
+ + a[..., 0, 1] * b[..., 1, 1]
45
+ + a[..., 0, 2] * b[..., 2, 1],
46
+ a[..., 0, 0] * b[..., 0, 2]
47
+ + a[..., 0, 1] * b[..., 1, 2]
48
+ + a[..., 0, 2] * b[..., 2, 2],
49
+ ],
50
+ dim=-1,
51
+ )
52
+ row_2 = torch.stack(
53
+ [
54
+ a[..., 1, 0] * b[..., 0, 0]
55
+ + a[..., 1, 1] * b[..., 1, 0]
56
+ + a[..., 1, 2] * b[..., 2, 0],
57
+ a[..., 1, 0] * b[..., 0, 1]
58
+ + a[..., 1, 1] * b[..., 1, 1]
59
+ + a[..., 1, 2] * b[..., 2, 1],
60
+ a[..., 1, 0] * b[..., 0, 2]
61
+ + a[..., 1, 1] * b[..., 1, 2]
62
+ + a[..., 1, 2] * b[..., 2, 2],
63
+ ],
64
+ dim=-1,
65
+ )
66
+ row_3 = torch.stack(
67
+ [
68
+ a[..., 2, 0] * b[..., 0, 0]
69
+ + a[..., 2, 1] * b[..., 1, 0]
70
+ + a[..., 2, 2] * b[..., 2, 0],
71
+ a[..., 2, 0] * b[..., 0, 1]
72
+ + a[..., 2, 1] * b[..., 1, 1]
73
+ + a[..., 2, 2] * b[..., 2, 1],
74
+ a[..., 2, 0] * b[..., 0, 2]
75
+ + a[..., 2, 1] * b[..., 1, 2]
76
+ + a[..., 2, 2] * b[..., 2, 2],
77
+ ],
78
+ dim=-1,
79
+ )
80
+
81
+ return torch.stack([row_1, row_2, row_3], dim=-2)
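A quick sanity check (not part of the diff): the hand-written product should agree with torch.matmul on arbitrary 3x3 batches.

a = torch.randn(5, 3, 3)
b = torch.randn(5, 3, 3)
print(torch.allclose(rot_matmul(a, b), torch.matmul(a, b), atol=1e-5))  # True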
82
+
83
+
84
+ def rot_vec_mul(
85
+ r: torch.Tensor,
86
+ t: torch.Tensor
87
+ ) -> torch.Tensor:
88
+ """
89
+ Applies a rotation to a vector. Written out by hand to avoid AMP
90
+ downcasting.
91
+
92
+ Args:
93
+ r: [*, 3, 3] rotation matrices
94
+ t: [*, 3] coordinate tensors
95
+ Returns:
96
+ [*, 3] rotated coordinates
97
+ """
98
+ x = t[..., 0]
99
+ y = t[..., 1]
100
+ z = t[..., 2]
101
+ return torch.stack(
102
+ [
103
+ r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z,
104
+ r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z,
105
+ r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z,
106
+ ],
107
+ dim=-1,
108
+ )
109
+
110
+
111
+ def identity_rot_mats(
112
+ batch_dims: Tuple[int],
113
+ dtype: Optional[torch.dtype] = None,
114
+ device: Optional[torch.device] = None,
115
+ requires_grad: bool = True,
116
+ ) -> torch.Tensor:
117
+ rots = torch.eye(
118
+ 3, dtype=dtype, device=device, requires_grad=requires_grad
119
+ )
120
+ rots = rots.view(*((1,) * len(batch_dims)), 3, 3)
121
+ rots = rots.expand(*batch_dims, -1, -1)
122
+
123
+ return rots
124
+
125
+
126
+ def identity_trans(
127
+ batch_dims: Tuple[int],
128
+ dtype: Optional[torch.dtype] = None,
129
+ device: Optional[torch.device] = None,
130
+ requires_grad: bool = True,
131
+ ) -> torch.Tensor:
132
+ trans = torch.zeros(
133
+ (*batch_dims, 3),
134
+ dtype=dtype,
135
+ device=device,
136
+ requires_grad=requires_grad
137
+ )
138
+ return trans
139
+
140
+
141
+ def identity_quats(
142
+ batch_dims: Tuple[int],
143
+ dtype: Optional[torch.dtype] = None,
144
+ device: Optional[torch.device] = None,
145
+ requires_grad: bool = True,
146
+ ) -> torch.Tensor:
147
+ quat = torch.zeros(
148
+ (*batch_dims, 4),
149
+ dtype=dtype,
150
+ device=device,
151
+ requires_grad=requires_grad
152
+ )
153
+
154
+ with torch.no_grad():
155
+ quat[..., 0] = 1
156
+
157
+ return quat
158
+
159
+
160
+ _quat_elements = ["a", "b", "c", "d"]
161
+ _qtr_keys = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements]
162
+ _qtr_ind_dict = {key: ind for ind, key in enumerate(_qtr_keys)}
163
+
164
+
165
+ def _to_mat(pairs):
166
+ mat = np.zeros((4, 4))
167
+ for pair in pairs:
168
+ key, value = pair
169
+ ind = _qtr_ind_dict[key]
170
+ mat[ind // 4][ind % 4] = value
171
+
172
+ return mat
173
+
174
+
175
+ _QTR_MAT = np.zeros((4, 4, 3, 3))
176
+ _QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)])
177
+ _QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)])
178
+ _QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)])
179
+ _QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)])
180
+ _QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)])
181
+ _QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)])
182
+ _QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)])
183
+ _QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)])
184
+ _QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)])
185
+
186
+
187
+ def quat_to_rot(quat: torch.Tensor) -> torch.Tensor:
188
+ """
189
+ Converts a quaternion to a rotation matrix.
190
+
191
+ Args:
192
+ quat: [*, 4] quaternions
193
+ Returns:
194
+ [*, 3, 3] rotation matrices
195
+ """
196
+ # [*, 4, 4]
197
+ quat = quat[..., None] * quat[..., None, :]
198
+
199
+ # [4, 4, 3, 3]
200
+ mat = quat.new_tensor(_QTR_MAT, requires_grad=False)
201
+
202
+ # [*, 4, 4, 3, 3]
203
+ shaped_qtr_mat = mat.view((1,) * len(quat.shape[:-2]) + mat.shape)
204
+ quat = quat[..., None, None] * shaped_qtr_mat
205
+
206
+ # [*, 3, 3]
207
+ return torch.sum(quat, dim=(-3, -4))
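A minimal sanity check (not part of the diff): the identity quaternion (1, 0, 0, 0) should yield the 3x3 identity matrix.

identity_quat = torch.tensor([1.0, 0.0, 0.0, 0.0])
print(torch.allclose(quat_to_rot(identity_quat), torch.eye(3)))  # True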
208
+
209
+
210
+ def rot_to_quat(
211
+ rot: torch.Tensor,
212
+ ):
213
+ if(rot.shape[-2:] != (3, 3)):
214
+ raise ValueError("Input rotation is incorrectly shaped")
215
+
216
+ rot = [[rot[..., i, j] for j in range(3)] for i in range(3)]
217
+ [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = rot
218
+
219
+ k = [
220
+ [ xx + yy + zz, zy - yz, xz - zx, yx - xy,],
221
+ [ zy - yz, xx - yy - zz, xy + yx, xz + zx,],
222
+ [ xz - zx, xy + yx, yy - xx - zz, yz + zy,],
223
+ [ yx - xy, xz + zx, yz + zy, zz - xx - yy,]
224
+ ]
225
+
226
+ k = (1./3.) * torch.stack([torch.stack(t, dim=-1) for t in k], dim=-2)
227
+
228
+ _, vectors = torch.linalg.eigh(k)
229
+ return vectors[..., -1]
230
+
231
+
232
+ _QUAT_MULTIPLY = np.zeros((4, 4, 4))
233
+ _QUAT_MULTIPLY[:, :, 0] = [[ 1, 0, 0, 0],
234
+ [ 0,-1, 0, 0],
235
+ [ 0, 0,-1, 0],
236
+ [ 0, 0, 0,-1]]
237
+
238
+ _QUAT_MULTIPLY[:, :, 1] = [[ 0, 1, 0, 0],
239
+ [ 1, 0, 0, 0],
240
+ [ 0, 0, 0, 1],
241
+ [ 0, 0,-1, 0]]
242
+
243
+ _QUAT_MULTIPLY[:, :, 2] = [[ 0, 0, 1, 0],
244
+ [ 0, 0, 0,-1],
245
+ [ 1, 0, 0, 0],
246
+ [ 0, 1, 0, 0]]
247
+
248
+ _QUAT_MULTIPLY[:, :, 3] = [[ 0, 0, 0, 1],
249
+ [ 0, 0, 1, 0],
250
+ [ 0,-1, 0, 0],
251
+ [ 1, 0, 0, 0]]
252
+
253
+ _QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :]
254
+
255
+
256
+ def quat_multiply(quat1, quat2):
257
+ """Multiply a quaternion by another quaternion."""
258
+ mat = quat1.new_tensor(_QUAT_MULTIPLY)
259
+ reshaped_mat = mat.view((1,) * len(quat1.shape[:-1]) + mat.shape)
260
+ return torch.sum(
261
+ reshaped_mat *
262
+ quat1[..., :, None, None] *
263
+ quat2[..., None, :, None],
264
+ dim=(-3, -2)
265
+ )
266
+
267
+
268
+ def quat_multiply_by_vec(quat, vec):
269
+ """Multiply a quaternion by a pure-vector quaternion."""
270
+ mat = quat.new_tensor(_QUAT_MULTIPLY_BY_VEC)
271
+ reshaped_mat = mat.view((1,) * len(quat.shape[:-1]) + mat.shape)
272
+ return torch.sum(
273
+ reshaped_mat *
274
+ quat[..., :, None, None] *
275
+ vec[..., None, :, None],
276
+ dim=(-3, -2)
277
+ )
278
+
279
+
280
+ def invert_rot_mat(rot_mat: torch.Tensor):
281
+ return rot_mat.transpose(-1, -2)
282
+
283
+
284
+ def invert_quat(quat: torch.Tensor):
285
+ quat_prime = quat.clone()
286
+ quat_prime[..., 1:] *= -1
287
+ inv = quat_prime / torch.sum(quat ** 2, dim=-1, keepdim=True)
288
+ return inv
289
+
290
+
291
+ class Rotation:
292
+ """
293
+ A 3D rotation. Depending on how the object is initialized, the
294
+ rotation is represented by either a rotation matrix or a
295
+ quaternion, though both formats are made available by helper functions.
296
+ To simplify gradient computation, the underlying format of the
297
+ rotation cannot be changed in-place. Like Rigid, the class is designed
298
+ to mimic the behavior of a torch Tensor, almost as if each Rotation
299
+ object were a tensor of rotations, in one format or another.
300
+ """
301
+ def __init__(self,
302
+ rot_mats: Optional[torch.Tensor] = None,
303
+ quats: Optional[torch.Tensor] = None,
304
+ normalize_quats: bool = True,
305
+ ):
306
+ """
307
+ Args:
308
+ rot_mats:
309
+ A [*, 3, 3] rotation matrix tensor. Mutually exclusive with
310
+ quats
311
+ quats:
312
+ A [*, 4] quaternion. Mutually exclusive with rot_mats. If
313
+ normalize_quats is not True, must be a unit quaternion
314
+ normalize_quats:
315
+ If quats is specified, whether to normalize quats
316
+ """
317
+ if((rot_mats is None and quats is None) or
318
+ (rot_mats is not None and quats is not None)):
319
+ raise ValueError("Exactly one input argument must be specified")
320
+
321
+ if((rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or
322
+ (quats is not None and quats.shape[-1] != 4)):
323
+ raise ValueError(
324
+ "Incorrectly shaped rotation matrix or quaternion"
325
+ )
326
+
327
+ # Force full-precision
328
+ if(quats is not None):
329
+ quats = quats.type(torch.float32)
330
+ if(rot_mats is not None):
331
+ rot_mats = rot_mats.type(torch.float32)
332
+
333
+ if(quats is not None and normalize_quats):
334
+ quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True)
335
+
336
+ self._rot_mats = rot_mats
337
+ self._quats = quats
338
+
339
+ @staticmethod
340
+ def identity(
341
+ shape,
342
+ dtype: Optional[torch.dtype] = None,
343
+ device: Optional[torch.device] = None,
344
+ requires_grad: bool = True,
345
+ fmt: str = "quat",
346
+ ):
347
+ """
348
+ Returns an identity Rotation.
349
+
350
+ Args:
351
+ shape:
352
+ The "shape" of the resulting Rotation object. See documentation
353
+ for the shape property
354
+ dtype:
355
+ The torch dtype for the rotation
356
+ device:
357
+ The torch device for the new rotation
358
+ requires_grad:
359
+ Whether the underlying tensors in the new rotation object
360
+ should require gradient computation
361
+ fmt:
362
+ One of "quat" or "rot_mat". Determines the underlying format
363
+ of the new object's rotation
364
+ Returns:
365
+ A new identity rotation
366
+ """
367
+ if(fmt == "rot_mat"):
368
+ rot_mats = identity_rot_mats(
369
+ shape, dtype, device, requires_grad,
370
+ )
371
+ return Rotation(rot_mats=rot_mats, quats=None)
372
+ elif(fmt == "quat"):
373
+ quats = identity_quats(shape, dtype, device, requires_grad)
374
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
375
+ else:
376
+ raise ValueError(f"Invalid format: {fmt}")
377
+
378
+ # Magic methods
379
+
380
+ def __getitem__(self, index: Any):
381
+ """
382
+ Allows torch-style indexing over the virtual shape of the rotation
383
+ object. See documentation for the shape property.
384
+
385
+ Args:
386
+ index:
387
+ A torch index. E.g. (1, 3, 2), or (slice(None,))
388
+ Returns:
389
+ The indexed rotation
390
+ """
391
+ if type(index) != tuple:
392
+ index = (index,)
393
+
394
+ if(self._rot_mats is not None):
395
+ rot_mats = self._rot_mats[index + (slice(None), slice(None))]
396
+ return Rotation(rot_mats=rot_mats)
397
+ elif(self._quats is not None):
398
+ quats = self._quats[index + (slice(None),)]
399
+ return Rotation(quats=quats, normalize_quats=False)
400
+ else:
401
+ raise ValueError("Both rotations are None")
402
+
403
+ def __mul__(self,
404
+ right: torch.Tensor,
405
+ ):
406
+ """
407
+ Pointwise left multiplication of the rotation with a tensor. Can be
408
+ used to e.g. mask the Rotation.
409
+
410
+ Args:
411
+ right:
412
+ The tensor multiplicand
413
+ Returns:
414
+ The product
415
+ """
416
+ if not(isinstance(right, torch.Tensor)):
417
+ raise TypeError("The other multiplicand must be a Tensor")
418
+
419
+ if(self._rot_mats is not None):
420
+ rot_mats = self._rot_mats * right[..., None, None]
421
+ return Rotation(rot_mats=rot_mats, quats=None)
422
+ elif(self._quats is not None):
423
+ quats = self._quats * right[..., None]
424
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
425
+ else:
426
+ raise ValueError("Both rotations are None")
427
+
428
+ def __rmul__(self,
429
+ left: torch.Tensor,
430
+ ):
431
+ """
432
+ Reverse pointwise multiplication of the rotation with a tensor.
433
+
434
+ Args:
435
+ left:
436
+ The left multiplicand
437
+ Returns:
438
+ The product
439
+ """
440
+ return self.__mul__(left)
441
+
442
+ # Properties
443
+
444
+ @property
445
+ def shape(self) -> torch.Size:
446
+ """
447
+ Returns the virtual shape of the rotation object. This shape is
448
+ defined as the batch dimensions of the underlying rotation matrix
449
+ or quaternion. If the Rotation was initialized with a [10, 3, 3]
450
+ rotation matrix tensor, for example, the resulting shape would be
451
+ [10].
452
+
453
+ Returns:
454
+ The virtual shape of the rotation object
455
+ """
456
+ s = None
457
+ if(self._quats is not None):
458
+ s = self._quats.shape[:-1]
459
+ else:
460
+ s = self._rot_mats.shape[:-2]
461
+
462
+ return s
463
+
464
+ @property
465
+ def dtype(self) -> torch.dtype:
466
+ """
467
+ Returns the dtype of the underlying rotation.
468
+
469
+ Returns:
470
+ The dtype of the underlying rotation
471
+ """
472
+ if(self._rot_mats is not None):
473
+ return self._rot_mats.dtype
474
+ elif(self._quats is not None):
475
+ return self._quats.dtype
476
+ else:
477
+ raise ValueError("Both rotations are None")
478
+
479
+ @property
480
+ def device(self) -> torch.device:
481
+ """
482
+ The device of the underlying rotation
483
+
484
+ Returns:
485
+ The device of the underlying rotation
486
+ """
487
+ if(self._rot_mats is not None):
488
+ return self._rot_mats.device
489
+ elif(self._quats is not None):
490
+ return self._quats.device
491
+ else:
492
+ raise ValueError("Both rotations are None")
493
+
494
+ @property
495
+ def requires_grad(self) -> bool:
496
+ """
497
+ Returns the requires_grad property of the underlying rotation
498
+
499
+ Returns:
500
+ The requires_grad property of the underlying tensor
501
+ """
502
+ if(self._rot_mats is not None):
503
+ return self._rot_mats.requires_grad
504
+ elif(self._quats is not None):
505
+ return self._quats.requires_grad
506
+ else:
507
+ raise ValueError("Both rotations are None")
508
+
509
+ def get_rot_mats(self) -> torch.Tensor:
510
+ """
511
+ Returns the underlying rotation as a rotation matrix tensor.
512
+
513
+ Returns:
514
+ The rotation as a rotation matrix tensor
515
+ """
516
+ rot_mats = self._rot_mats
517
+ if(rot_mats is None):
518
+ if(self._quats is None):
519
+ raise ValueError("Both rotations are None")
520
+ else:
521
+ rot_mats = quat_to_rot(self._quats)
522
+
523
+ return rot_mats
524
+
525
+ def get_quats(self) -> torch.Tensor:
526
+ """
527
+ Returns the underlying rotation as a quaternion tensor.
528
+
529
+ Depending on whether the Rotation was initialized with a
530
+ quaternion, this function may call torch.linalg.eigh.
531
+
532
+ Returns:
533
+ The rotation as a quaternion tensor.
534
+ """
535
+ quats = self._quats
536
+ if(quats is None):
537
+ if(self._rot_mats is None):
538
+ raise ValueError("Both rotations are None")
539
+ else:
540
+ # quats = rot_to_quat(self._rot_mats)
541
+ quats = rotation3d.matrix_to_quaternion(self._rot_mats)
542
+
543
+ return quats
544
+
545
+ def get_cur_rot(self) -> torch.Tensor:
546
+ """
547
+ Return the underlying rotation in its current form
548
+
549
+ Returns:
550
+ The stored rotation
551
+ """
552
+ if(self._rot_mats is not None):
553
+ return self._rot_mats
554
+ elif(self._quats is not None):
555
+ return self._quats
556
+ else:
557
+ raise ValueError("Both rotations are None")
558
+
559
+ def get_rotvec(self, eps=1e-6) -> torch.Tensor:
560
+ """
561
+ Return the underlying axis-angle rotation vector.
562
+
563
+ Follows scipy's implementation:
564
+ https://github.com/scipy/scipy/blob/HEAD/scipy/spatial/transform/_rotation.pyx#L1385-L1402
565
+
566
+ Returns:
567
+ The stored rotation as an axis-angle vector.
568
+ """
569
+ quat = self.get_quats()
570
+ # w > 0 to ensure 0 <= angle <= pi
571
+ flip = (quat[..., :1] < 0).float()
572
+ quat = (-1 * quat) * flip + (1 - flip) * quat
573
+
574
+ angle = 2 * torch.atan2(
575
+ torch.linalg.norm(quat[..., 1:], dim=-1),
576
+ quat[..., 0]
577
+ )
578
+
579
+ angle2 = angle * angle
580
+ small_angle_scales = 2 + angle2 / 12 + 7 * angle2 * angle2 / 2880
581
+ large_angle_scales = angle / torch.sin(angle / 2 + eps)
582
+
583
+ small_angles = (angle <= 1e-3).float()
584
+ rot_vec_scale = small_angle_scales * small_angles + (1 - small_angles) * large_angle_scales
585
+ rot_vec = rot_vec_scale[..., None] * quat[..., 1:]
586
+ return rot_vec
587
+
588
+ # Rotation functions
589
+
590
+ def compose_q_update_vec(self,
591
+ q_update_vec: torch.Tensor,
592
+ normalize_quats: bool = True,
593
+ update_mask: torch.Tensor = None,
594
+ ):
595
+ """
596
+ Returns a new quaternion Rotation after updating the current
597
+ object's underlying rotation with a quaternion update, formatted
598
+ as a [*, 3] tensor whose final three columns represent x, y, z such
599
+ that (1, x, y, z) is the desired (not necessarily unit) quaternion
600
+ update.
601
+
602
+ Args:
603
+ q_update_vec:
604
+ A [*, 3] quaternion update tensor
605
+ normalize_quats:
606
+ Whether to normalize the output quaternion
607
+ Returns:
608
+ An updated Rotation
609
+ """
610
+ quats = self.get_quats()
611
+ quat_update = quat_multiply_by_vec(quats, q_update_vec)
612
+ if update_mask is not None:
613
+ quat_update = quat_update * update_mask
614
+ new_quats = quats + quat_update
615
+ return Rotation(
616
+ rot_mats=None,
617
+ quats=new_quats,
618
+ normalize_quats=normalize_quats,
619
+ )
620
+
621
+ def compose_r(self, r):
622
+ """
623
+ Compose the rotation matrices of the current Rotation object with
624
+ those of another.
625
+
626
+ Args:
627
+ r:
628
+ An update rotation object
629
+ Returns:
630
+ An updated rotation object
631
+ """
632
+ r1 = self.get_rot_mats()
633
+ r2 = r.get_rot_mats()
634
+ new_rot_mats = rot_matmul(r1, r2)
635
+ return Rotation(rot_mats=new_rot_mats, quats=None)
636
+
637
+ def compose_q(self, r, normalize_quats: bool = True):
638
+ """
639
+ Compose the quaternions of the current Rotation object with those
640
+ of another.
641
+
642
+ Depending on whether either Rotation was initialized with
643
+ quaternions, this function may call torch.linalg.eigh.
644
+
645
+ Args:
646
+ r:
647
+ An update rotation object
648
+ Returns:
649
+ An updated rotation object
650
+ """
651
+ q1 = self.get_quats()
652
+ q2 = r.get_quats()
653
+ new_quats = quat_multiply(q1, q2)
654
+ return Rotation(
655
+ rot_mats=None, quats=new_quats, normalize_quats=normalize_quats
656
+ )
657
+
658
+ def apply(self, pts: torch.Tensor) -> torch.Tensor:
659
+ """
660
+ Apply the current Rotation as a rotation matrix to a set of 3D
661
+ coordinates.
662
+
663
+ Args:
664
+ pts:
665
+ A [*, 3] set of points
666
+ Returns:
667
+ [*, 3] rotated points
668
+ """
669
+ rot_mats = self.get_rot_mats()
670
+ return rot_vec_mul(rot_mats, pts)
671
+
672
+ def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
673
+ """
674
+ The inverse of the apply() method.
675
+
676
+ Args:
677
+ pts:
678
+ A [*, 3] set of points
679
+ Returns:
680
+ [*, 3] inverse-rotated points
681
+ """
682
+ rot_mats = self.get_rot_mats()
683
+ inv_rot_mats = invert_rot_mat(rot_mats)
684
+ return rot_vec_mul(inv_rot_mats, pts)
685
+
686
+ def invert(self) :
687
+ """
688
+ Returns the inverse of the current Rotation.
689
+
690
+ Returns:
691
+ The inverse of the current Rotation
692
+ """
693
+ if(self._rot_mats is not None):
694
+ return Rotation(
695
+ rot_mats=invert_rot_mat(self._rot_mats),
696
+ quats=None
697
+ )
698
+ elif(self._quats is not None):
699
+ return Rotation(
700
+ rot_mats=None,
701
+ quats=invert_quat(self._quats),
702
+ normalize_quats=False,
703
+ )
704
+ else:
705
+ raise ValueError("Both rotations are None")
706
+
707
+ # "Tensor" stuff
708
+
709
+ def unsqueeze(self,
710
+ dim: int,
711
+ ):
712
+ """
713
+ Analogous to torch.unsqueeze. The dimension is relative to the
714
+ shape of the Rotation object.
715
+
716
+ Args:
717
+ dim: A positive or negative dimension index.
718
+ Returns:
719
+ The unsqueezed Rotation.
720
+ """
721
+ if dim >= len(self.shape):
722
+ raise ValueError("Invalid dimension")
723
+
724
+ if(self._rot_mats is not None):
725
+ rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2)
726
+ return Rotation(rot_mats=rot_mats, quats=None)
727
+ elif(self._quats is not None):
728
+ quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1)
729
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
730
+ else:
731
+ raise ValueError("Both rotations are None")
732
+
733
+ @staticmethod
734
+ def cat(
735
+ rs,
736
+ dim: int,
737
+ ):
738
+ """
739
+ Concatenates rotations along one of the batch dimensions. Analogous
740
+ to torch.cat().
741
+
742
+ Note that the output of this operation is always a rotation matrix,
743
+ regardless of the format of input rotations.
744
+
745
+ Args:
746
+ rs:
747
+ A list of rotation objects
748
+ dim:
749
+ The dimension along which the rotations should be
750
+ concatenated
751
+ Returns:
752
+ A concatenated Rotation object in rotation matrix format
753
+ """
754
+ rot_mats = [r.get_rot_mats() for r in rs]
755
+ rot_mats = torch.cat(rot_mats, dim=dim if dim >= 0 else dim - 2)
756
+
757
+ return Rotation(rot_mats=rot_mats, quats=None)
758
+
759
+ def map_tensor_fn(self,
760
+ fn
761
+ ):
762
+ """
763
+ Apply a Tensor -> Tensor function to underlying rotation tensors,
764
+ mapping over the rotation dimension(s). Can be used e.g. to sum out
765
+ a one-hot batch dimension.
766
+
767
+ Args:
768
+ fn:
769
+ A Tensor -> Tensor function to be mapped over the Rotation
770
+ Returns:
771
+ The transformed Rotation object
772
+ """
773
+ if(self._rot_mats is not None):
774
+ rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,))
775
+ rot_mats = torch.stack(
776
+ list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1
777
+ )
778
+ rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3))
779
+ return Rotation(rot_mats=rot_mats, quats=None)
780
+ elif(self._quats is not None):
781
+ quats = torch.stack(
782
+ list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1
783
+ )
784
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
785
+ else:
786
+ raise ValueError("Both rotations are None")
787
+
788
+ def cuda(self):
789
+ """
790
+ Analogous to the cuda() method of torch Tensors
791
+
792
+ Returns:
793
+ A copy of the Rotation in CUDA memory
794
+ """
795
+ if(self._rot_mats is not None):
796
+ return Rotation(rot_mats=self._rot_mats.cuda(), quats=None)
797
+ elif(self._quats is not None):
798
+ return Rotation(
799
+ rot_mats=None,
800
+ quats=self._quats.cuda(),
801
+ normalize_quats=False
802
+ )
803
+ else:
804
+ raise ValueError("Both rotations are None")
805
+
806
+ def to(self,
807
+ device: Optional[torch.device],
808
+ dtype: Optional[torch.dtype]
809
+ ):
810
+ """
811
+ Analogous to the to() method of torch Tensors
812
+
813
+ Args:
814
+ device:
815
+ A torch device
816
+ dtype:
817
+ A torch dtype
818
+ Returns:
819
+ A copy of the Rotation using the new device and dtype
820
+ """
821
+ if(self._rot_mats is not None):
822
+ return Rotation(
823
+ rot_mats=self._rot_mats.to(device=device, dtype=dtype),
824
+ quats=None,
825
+ )
826
+ elif(self._quats is not None):
827
+ return Rotation(
828
+ rot_mats=None,
829
+ quats=self._quats.to(device=device, dtype=dtype),
830
+ normalize_quats=False,
831
+ )
832
+ else:
833
+ raise ValueError("Both rotations are None")
834
+
835
+ def detach(self):
836
+ """
837
+ Returns a copy of the Rotation whose underlying Tensor has been
838
+ detached from its torch graph.
839
+
840
+ Returns:
841
+ A copy of the Rotation whose underlying Tensor has been detached
842
+ from its torch graph
843
+ """
844
+ if(self._rot_mats is not None):
845
+ return Rotation(rot_mats=self._rot_mats.detach(), quats=None)
846
+ elif(self._quats is not None):
847
+ return Rotation(
848
+ rot_mats=None,
849
+ quats=self._quats.detach(),
850
+ normalize_quats=False,
851
+ )
852
+ else:
853
+ raise ValueError("Both rotations are None")
854
+
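A usage sketch for the class above (not part of the diff): wrap a batch of rotation matrices and apply them to points.

rots = Rotation(rot_mats=torch.eye(3).expand(10, 3, 3), quats=None)
points = torch.randn(10, 3)
rotated = rots.apply(points)            # identity rotations leave the points unchanged
print(rots.shape, rotated.shape)        # torch.Size([10]) torch.Size([10, 3])
print(torch.allclose(rotated, points))  # True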
855
+
856
+ class Rigid:
857
+ """
858
+ A class representing a rigid transformation. Little more than a wrapper
859
+ around two objects: a Rotation object and a [*, 3] translation.
860
+ Designed to behave approximately like a single torch tensor with the
861
+ shape of the shared batch dimensions of its component parts.
862
+ """
863
+ def __init__(self,
864
+ rots: Optional[Rotation],
865
+ trans: Optional[torch.Tensor],
866
+ ):
867
+ """
868
+ Args:
869
+ rots: A Rotation object with batch shape [*]
870
+ trans: A corresponding [*, 3] translation tensor
871
+ """
872
+ # (we need device, dtype, etc. from at least one input)
873
+
874
+ batch_dims, dtype, device, requires_grad = None, None, None, None
875
+ if(trans is not None):
876
+ batch_dims = trans.shape[:-1]
877
+ dtype = trans.dtype
878
+ device = trans.device
879
+ requires_grad = trans.requires_grad
880
+ elif(rots is not None):
881
+ batch_dims = rots.shape
882
+ dtype = rots.dtype
883
+ device = rots.device
884
+ requires_grad = rots.requires_grad
885
+ else:
886
+ raise ValueError("At least one input argument must be specified")
887
+
888
+ if(rots is None):
889
+ rots = Rotation.identity(
890
+ batch_dims, dtype, device, requires_grad,
891
+ )
892
+ elif(trans is None):
893
+ trans = identity_trans(
894
+ batch_dims, dtype, device, requires_grad,
895
+ )
896
+
897
+ if((rots.shape != trans.shape[:-1]) or
898
+ (rots.device != trans.device)):
899
+ raise ValueError("Rots and trans incompatible")
900
+
901
+ # Force full precision. Happens to the rotations automatically.
902
+ trans = trans.type(torch.float32)
903
+
904
+ self._rots = rots
905
+ self._trans = trans
906
+
907
+ @staticmethod
908
+ def identity(
909
+ shape: Tuple[int],
910
+ dtype: Optional[torch.dtype] = None,
911
+ device: Optional[torch.device] = None,
912
+ requires_grad: bool = True,
913
+ fmt: str = "quat",
914
+ ):
915
+ """
916
+ Constructs an identity transformation.
917
+
918
+ Args:
919
+ shape:
920
+ The desired shape
921
+ dtype:
922
+ The dtype of both internal tensors
923
+ device:
924
+ The device of both internal tensors
925
+ requires_grad:
926
+ Whether grad should be enabled for the internal tensors
927
+ Returns:
928
+ The identity transformation
929
+ """
930
+ return Rigid(
931
+ Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt),
932
+ identity_trans(shape, dtype, device, requires_grad),
933
+ )
934
+
935
+ def __getitem__(self,
936
+ index: Any,
937
+ ):
938
+ """
939
+ Indexes the affine transformation with PyTorch-style indices.
940
+ The index is applied to the shared dimensions of both the rotation
941
+ and the translation.
942
+
943
+ E.g.::
944
+
945
+ r = Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None)
946
+ t = Rigid(r, torch.rand(10, 10, 3))
947
+ indexed = t[3, 4:6]
948
+ assert(indexed.shape == (2,))
949
+ assert(indexed.get_rots().shape == (2,))
950
+ assert(indexed.get_trans().shape == (2, 3))
951
+
952
+ Args:
953
+ index: A standard torch tensor index. E.g. 8, (10, None, 3),
954
+ or (3, slice(0, 1, None))
955
+ Returns:
956
+ The indexed tensor
957
+ """
958
+ if type(index) != tuple:
959
+ index = (index,)
960
+
961
+ return Rigid(
962
+ self._rots[index],
963
+ self._trans[index + (slice(None),)],
964
+ )
965
+
966
+ def __mul__(self,
967
+ right: torch.Tensor,
968
+ ):
969
+ """
970
+ Pointwise left multiplication of the transformation with a tensor.
971
+ Can be used to e.g. mask the Rigid.
972
+
973
+ Args:
974
+ right:
975
+ The tensor multiplicand
976
+ Returns:
977
+ The product
978
+ """
979
+ if not(isinstance(right, torch.Tensor)):
980
+ raise TypeError("The other multiplicand must be a Tensor")
981
+
982
+ new_rots = self._rots * right
983
+ new_trans = self._trans * right[..., None]
984
+
985
+ return Rigid(new_rots, new_trans)
986
+
987
+ def __rmul__(self,
988
+ left: torch.Tensor,
989
+ ):
990
+ """
991
+ Reverse pointwise multiplication of the transformation with a
992
+ tensor.
993
+
994
+ Args:
995
+ left:
996
+ The left multiplicand
997
+ Returns:
998
+ The product
999
+ """
1000
+ return self.__mul__(left)
1001
+
1002
+ @property
1003
+ def shape(self) -> torch.Size:
1004
+ """
1005
+ Returns the shape of the shared dimensions of the rotation and
1006
+ the translation.
1007
+
1008
+ Returns:
1009
+ The shape of the transformation
1010
+ """
1011
+ s = self._trans.shape[:-1]
1012
+ return s
1013
+
1014
+ @property
1015
+ def device(self) -> torch.device:
1016
+ """
1017
+ Returns the device on which the Rigid's tensors are located.
1018
+
1019
+ Returns:
1020
+ The device on which the Rigid's tensors are located
1021
+ """
1022
+ return self._trans.device
1023
+
1024
+ def get_rots(self) -> Rotation:
1025
+ """
1026
+ Getter for the rotation.
1027
+
1028
+ Returns:
1029
+ The rotation object
1030
+ """
1031
+ return self._rots
1032
+
1033
+ def get_trans(self) -> torch.Tensor:
1034
+ """
1035
+ Getter for the translation.
1036
+
1037
+ Returns:
1038
+ The stored translation
1039
+ """
1040
+ return self._trans
1041
+
1042
+ def compose_q_update_vec(self,
1043
+ q_update_vec: torch.Tensor,
1044
+ update_mask: torch.Tensor=None,
1045
+ ):
1046
+ """
1047
+ Composes the transformation with a quaternion update vector of
1048
+ shape [*, 6], where the first three columns are the x, y, and
1049
+ z values of a quaternion update of form (1, x, y, z) and the
1050
+ last three are a 3D translation.
1051
+
1052
+ Args:
1053
+ q_update_vec: The [*, 6] quaternion update vector.
1054
+ Returns:
1055
+ The composed transformation.
1056
+ """
1057
+ q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:]
1058
+ new_rots = self._rots.compose_q_update_vec(
1059
+ q_vec, update_mask=update_mask)
1060
+
1061
+ trans_update = self._rots.apply(t_vec)
1062
+ if update_mask is not None:
1063
+ trans_update = trans_update * update_mask
1064
+ new_translation = self._trans + trans_update
1065
+
1066
+ return Rigid(new_rots, new_translation)
1067
+
1068
+ def compose(self,
1069
+ r,
1070
+ ):
1071
+ """
1072
+ Composes the current rigid object with another.
1073
+
1074
+ Args:
1075
+ r:
1076
+ Another Rigid object
1077
+ Returns:
1078
+ The composition of the two transformations
1079
+ """
1080
+ new_rot = self._rots.compose_r(r._rots)
1081
+ new_trans = self._rots.apply(r._trans) + self._trans
1082
+ return Rigid(new_rot, new_trans)
1083
+
1084
+ def compose_r(self,
1085
+ rot,
1086
+ order='right'
1087
+ ):
1088
+ """
1089
+ Composes the rotation of the current rigid object with another rotation.
1090
+
1091
+ Args:
1092
+ rot:
1093
+ A Rotation object
1094
+ order:
1095
+ Order in which to perform rotation multiplication.
1096
+ Returns:
1097
+ A Rigid with the composed rotation and the original translation
1098
+ """
1099
+ if order == 'right':
1100
+ new_rot = self._rots.compose_r(rot)
1101
+ elif order == 'left':
1102
+ new_rot = rot.compose_r(self._rots)
1103
+ else:
1104
+ raise ValueError(f'Unrecognized multiplication order: {order}')
1105
+ return Rigid(new_rot, self._trans)
1106
+
1107
+ def apply(self,
1108
+ pts: torch.Tensor,
1109
+ ) -> torch.Tensor:
1110
+ """
1111
+ Applies the transformation to a coordinate tensor.
1112
+
1113
+ Args:
1114
+ pts: A [*, 3] coordinate tensor.
1115
+ Returns:
1116
+ The transformed points.
1117
+ """
1118
+ rotated = self._rots.apply(pts)
1119
+ return rotated + self._trans
1120
+
1121
+ def invert_apply(self,
1122
+ pts: torch.Tensor
1123
+ ) -> torch.Tensor:
1124
+ """
1125
+ Applies the inverse of the transformation to a coordinate tensor.
1126
+
1127
+ Args:
1128
+ pts: A [*, 3] coordinate tensor
1129
+ Returns:
1130
+ The transformed points.
1131
+ """
1132
+ pts = pts - self._trans
1133
+ return self._rots.invert_apply(pts)
1134
+
1135
+ def invert(self):
1136
+ """
1137
+ Inverts the transformation.
1138
+
1139
+ Returns:
1140
+ The inverse transformation.
1141
+ """
1142
+ rot_inv = self._rots.invert()
1143
+ trn_inv = rot_inv.apply(self._trans)
1144
+
1145
+ return Rigid(rot_inv, -1 * trn_inv)
1146
+
1147
+ def map_tensor_fn(self,
1148
+ fn
1149
+ ):
1150
+ """
1151
+ Apply a Tensor -> Tensor function to underlying translation and
1152
+ rotation tensors, mapping over the translation/rotation dimensions
1153
+ respectively.
1154
+
1155
+ Args:
1156
+ fn:
1157
+ A Tensor -> Tensor function to be mapped over the Rigid
1158
+ Returns:
1159
+ The transformed Rigid object
1160
+ """
1161
+ new_rots = self._rots.map_tensor_fn(fn)
1162
+ new_trans = torch.stack(
1163
+ list(map(fn, torch.unbind(self._trans, dim=-1))),
1164
+ dim=-1
1165
+ )
1166
+
1167
+ return Rigid(new_rots, new_trans)
1168
+
1169
+ def to_tensor_4x4(self) -> torch.Tensor:
1170
+ """
1171
+ Converts a transformation to a homogenous transformation tensor.
1172
+
1173
+ Returns:
1174
+ A [*, 4, 4] homogenous transformation tensor
1175
+ """
1176
+ tensor = self._trans.new_zeros((*self.shape, 4, 4))
1177
+ tensor[..., :3, :3] = self._rots.get_rot_mats()
1178
+ tensor[..., :3, 3] = self._trans
1179
+ tensor[..., 3, 3] = 1
1180
+ return tensor
1181
+
1182
+ @staticmethod
1183
+ def from_tensor_4x4(
1184
+ t: torch.Tensor
1185
+ ):
1186
+ """
1187
+ Constructs a transformation from a homogenous transformation
1188
+ tensor.
1189
+
1190
+ Args:
1191
+ t: [*, 4, 4] homogenous transformation tensor
1192
+ Returns:
1193
+ A Rigid object with shape [*]
1194
+ """
1195
+ if(t.shape[-2:] != (4, 4)):
1196
+ raise ValueError("Incorrectly shaped input tensor")
1197
+
1198
+ rots = Rotation(rot_mats=t[..., :3, :3], quats=None)
1199
+ trans = t[..., :3, 3]
1200
+
1201
+ return Rigid(rots, trans)
1202
+
1203
+ def to_tensor_7(self) -> torch.Tensor:
1204
+ """
1205
+ Converts a transformation to a tensor with 7 final columns, four
1206
+ for the quaternion followed by three for the translation.
1207
+
1208
+ Returns:
1209
+ A [*, 7] tensor representation of the transformation
1210
+ """
1211
+ tensor = self._trans.new_zeros((*self.shape, 7))
1212
+ tensor[..., :4] = self._rots.get_quats()
1213
+ tensor[..., 4:] = self._trans
1214
+
1215
+ return tensor
1216
+
1217
+ @staticmethod
1218
+ def from_tensor_7(
1219
+ t: torch.Tensor,
1220
+ normalize_quats: bool = False,
1221
+ ):
1222
+ if(t.shape[-1] != 7):
1223
+ raise ValueError("Incorrectly shaped input tensor")
1224
+
1225
+ quats, trans = t[..., :4], t[..., 4:]
1226
+
1227
+ rots = Rotation(
1228
+ rot_mats=None,
1229
+ quats=quats,
1230
+ normalize_quats=normalize_quats
1231
+ )
1232
+
1233
+ return Rigid(rots, trans)
1234
+
1235
+ @staticmethod
1236
+ def from_3_points(
1237
+ p_neg_x_axis: torch.Tensor,
1238
+ origin: torch.Tensor,
1239
+ p_xy_plane: torch.Tensor,
1240
+ eps: float = 1e-8
1241
+ ):
1242
+ """
1243
+ Implements algorithm 21. Constructs transformations from sets of 3
1244
+ points using the Gram-Schmidt algorithm.
1245
+
1246
+ Args:
1247
+ p_neg_x_axis: [*, 3] coordinates
1248
+ origin: [*, 3] coordinates used as frame origins
1249
+ p_xy_plane: [*, 3] coordinates
1250
+ eps: Small epsilon value
1251
+ Returns:
1252
+ A transformation object of shape [*]
1253
+ """
1254
+ p_neg_x_axis = torch.unbind(p_neg_x_axis, dim=-1)
1255
+ origin = torch.unbind(origin, dim=-1)
1256
+ p_xy_plane = torch.unbind(p_xy_plane, dim=-1)
1257
+
1258
+ e0 = [c1 - c2 for c1, c2 in zip(origin, p_neg_x_axis)]
1259
+ e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane, origin)]
1260
+
1261
+ denom = torch.sqrt(sum((c * c for c in e0)) + eps)
1262
+ e0 = [c / denom for c in e0]
1263
+ dot = sum((c1 * c2 for c1, c2 in zip(e0, e1)))
1264
+ e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)]
1265
+ denom = torch.sqrt(sum((c * c for c in e1)) + eps)
1266
+ e1 = [c / denom for c in e1]
1267
+ e2 = [
1268
+ e0[1] * e1[2] - e0[2] * e1[1],
1269
+ e0[2] * e1[0] - e0[0] * e1[2],
1270
+ e0[0] * e1[1] - e0[1] * e1[0],
1271
+ ]
1272
+
1273
+ rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1)
1274
+ rots = rots.reshape(rots.shape[:-1] + (3, 3))
1275
+
1276
+ rot_obj = Rotation(rot_mats=rots, quats=None)
1277
+
1278
+ return Rigid(rot_obj, torch.stack(origin, dim=-1))
1279
+
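# Illustrative usage sketch (hypothetical tensors, not from the original file):
# frames built by from_3_points() should round-trip arbitrary points through
# apply() / invert_apply(), e.g.
#
#   p1, origin, p2 = torch.rand(5, 3), torch.rand(5, 3), torch.rand(5, 3)
#   frames = Rigid.from_3_points(p1, origin, p2)   # batch shape [5]
#   pts = torch.rand(5, 3)
#   assert torch.allclose(frames.invert_apply(frames.apply(pts)), pts, atol=1e-5)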
1280
+ def unsqueeze(self,
1281
+ dim: int,
1282
+ ):
1283
+ """
1284
+ Analogous to torch.unsqueeze. The dimension is relative to the
1285
+ shared dimensions of the rotation/translation.
1286
+
1287
+ Args:
1288
+ dim: A positive or negative dimension index.
1289
+ Returns:
1290
+ The unsqueezed transformation.
1291
+ """
1292
+ if dim >= len(self.shape):
1293
+ raise ValueError("Invalid dimension")
1294
+ rots = self._rots.unsqueeze(dim)
1295
+ trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1)
1296
+
1297
+ return Rigid(rots, trans)
1298
+
1299
+ @staticmethod
1300
+ def cat(
1301
+ ts,
1302
+ dim: int,
1303
+ ):
1304
+ """
1305
+ Concatenates transformations along one of the batch dimensions.
1306
+
1307
+ Args:
1308
+ ts:
1309
+ A list of Rigid objects
1310
+ dim:
1311
+ The dimension along which the transformations should be
1312
+ concatenated
1313
+ Returns:
1314
+ A concatenated transformation object
1315
+ """
1316
+ rots = Rotation.cat([t._rots for t in ts], dim)
1317
+ trans = torch.cat(
1318
+ [t._trans for t in ts], dim=dim if dim >= 0 else dim - 1
1319
+ )
1320
+
1321
+ return Rigid(rots, trans)
1322
+
1323
+ def apply_rot_fn(self, fn):
1324
+ """
1325
+ Applies a Rotation -> Rotation function to the stored rotation
1326
+ object.
1327
+
1328
+ Args:
1329
+ fn: A function of type Rotation -> Rotation
1330
+ Returns:
1331
+ A transformation object with a transformed rotation.
1332
+ """
1333
+ return Rigid(fn(self._rots), self._trans)
1334
+
1335
+ def apply_trans_fn(self, fn):
1336
+ """
1337
+ Applies a Tensor -> Tensor function to the stored translation.
1338
+
1339
+ Args:
1340
+ fn:
1341
+ A function of type Tensor -> Tensor to be applied to the
1342
+ translation
1343
+ Returns:
1344
+ A transformation object with a transformed translation.
1345
+ """
1346
+ return Rigid(self._rots, fn(self._trans))
1347
+
1348
+ def scale_translation(self, trans_scale_factor: float):
1349
+ """
1350
+ Scales the translation by a constant factor.
1351
+
1352
+ Args:
1353
+ trans_scale_factor:
1354
+ The constant factor
1355
+ Returns:
1356
+ A transformation object with a scaled translation.
1357
+ """
1358
+ fn = lambda t: t * trans_scale_factor
1359
+ return self.apply_trans_fn(fn)
1360
+
1361
+ def stop_rot_gradient(self):
1362
+ """
1363
+ Detaches the underlying rotation object
1364
+
1365
+ Returns:
1366
+ A transformation object with detached rotations
1367
+ """
1368
+ fn = lambda r: r.detach()
1369
+ return self.apply_rot_fn(fn)
1370
+
1371
+ @staticmethod
1372
+ def make_transform_from_reference(n_xyz, ca_xyz, c_xyz, eps=1e-20):
1373
+ """
1374
+ Returns a transformation object from reference coordinates.
1375
+
1376
+ Note that this method does not take care of symmetries. If you
1377
+ provide the atom positions in the non-standard way, the N atom will
1378
+ end up not at [-0.527250, 1.359329, 0.0] but instead at
1379
+ [-0.527250, -1.359329, 0.0]. You need to take care of such cases in
1380
+ your code.
1381
+
1382
+ Args:
1383
+ n_xyz: A [*, 3] tensor of nitrogen xyz coordinates.
1384
+ ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates.
1385
+ c_xyz: A [*, 3] tensor of carbon xyz coordinates.
1386
+ Returns:
1387
+ A transformation object. After applying the translation and
1388
+ rotation to the reference backbone, the coordinates will be
1389
+ approximately equal to the input coordinates.
1390
+ """
1391
+ translation = -1 * ca_xyz
1392
+ n_xyz = n_xyz + translation
1393
+ c_xyz = c_xyz + translation
1394
+
1395
+ c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)]
1396
+ norm = torch.sqrt(eps + c_x ** 2 + c_y ** 2)
1397
+ sin_c1 = -c_y / norm
1398
+ cos_c1 = c_x / norm
1399
+ zeros = sin_c1.new_zeros(sin_c1.shape)
1400
+ ones = sin_c1.new_ones(sin_c1.shape)
1401
+
1402
+ c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3))
1403
+ c1_rots[..., 0, 0] = cos_c1
1404
+ c1_rots[..., 0, 1] = -1 * sin_c1
1405
+ c1_rots[..., 1, 0] = sin_c1
1406
+ c1_rots[..., 1, 1] = cos_c1
1407
+ c1_rots[..., 2, 2] = 1
1408
+
1409
+ norm = torch.sqrt(eps + c_x ** 2 + c_y ** 2 + c_z ** 2)
1410
+ sin_c2 = c_z / norm
1411
+ cos_c2 = torch.sqrt(c_x ** 2 + c_y ** 2) / norm
1412
+
1413
+ c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
1414
+ c2_rots[..., 0, 0] = cos_c2
1415
+ c2_rots[..., 0, 2] = sin_c2
1416
+ c2_rots[..., 1, 1] = 1
1417
+ c2_rots[..., 2, 0] = -1 * sin_c2
1418
+ c2_rots[..., 2, 2] = cos_c2
1419
+
1420
+ c_rots = rot_matmul(c2_rots, c1_rots)
1421
+ n_xyz = rot_vec_mul(c_rots, n_xyz)
1422
+
1423
+ _, n_y, n_z = [n_xyz[..., i] for i in range(3)]
1424
+ norm = torch.sqrt(eps + n_y ** 2 + n_z ** 2)
1425
+ sin_n = -n_z / norm
1426
+ cos_n = n_y / norm
1427
+
1428
+ n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
1429
+ n_rots[..., 0, 0] = 1
1430
+ n_rots[..., 1, 1] = cos_n
1431
+ n_rots[..., 1, 2] = -1 * sin_n
1432
+ n_rots[..., 2, 1] = sin_n
1433
+ n_rots[..., 2, 2] = cos_n
1434
+
1435
+ rots = rot_matmul(n_rots, c_rots)
1436
+
1437
+ rots = rots.transpose(-1, -2)
1438
+ translation = -1 * translation
1439
+
1440
+ rot_obj = Rotation(rot_mats=rots, quats=None)
1441
+
1442
+ return Rigid(rot_obj, translation)
1443
+
1444
+ def cuda(self):
1445
+ """
1446
+ Moves the transformation object to GPU memory
1447
+
1448
+ Returns:
1449
+ A version of the transformation on GPU
1450
+ """
1451
+ return Rigid(self._rots.cuda(), self._trans.cuda())
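For orientation, the `Rotation`/`Rigid` classes above appear to follow the OpenFold-style rigid-transformation utilities. Below is a minimal sketch, assuming `analysis/` is on `PYTHONPATH` so the repo's `src.*` imports resolve; the batch shape and tensors are illustrative, not taken from the original code.

import torch
from src.common.rigid_utils import Rigid, Rotation

# Identity transform with batch shape (2, 10)
t = Rigid.identity((2, 10), requires_grad=False)

# A second transform: identity rotations plus random translations
r = Rigid(
    Rotation(rot_mats=torch.eye(3).expand(2, 10, 3, 3), quats=None),
    torch.rand(2, 10, 3),
)

# Compose, then round-trip through the [*, 4, 4] homogeneous representation
composed = t.compose(r)
recovered = Rigid.from_tensor_4x4(composed.to_tensor_4x4())
assert torch.allclose(recovered.get_trans(), composed.get_trans(), atol=1e-6)

The `cuda()`, `to()` and `detach()` helpers defined above let the rotations and the full transform be moved and detached much like plain tensors.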
analysis/src/common/rotation3d.py ADDED
@@ -0,0 +1,596 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Optional, Union
8
+
9
+ import torch
10
+ from torch.nn import functional as F
11
+
12
+ Device = Union[str, torch.device]
13
+
14
+
15
+ """
16
+ The transformation matrices returned from the functions in this file assume
17
+ the points on which the transformation will be applied are column vectors.
18
+ i.e. the R matrix is structured as
19
+
20
+ R = [
21
+ [Rxx, Rxy, Rxz],
22
+ [Ryx, Ryy, Ryz],
23
+ [Rzx, Rzy, Rzz],
24
+ ] # (3, 3)
25
+
26
+ This matrix can be applied to column vectors by post multiplication
27
+ by the points e.g.
28
+
29
+ points = [[0], [1], [2]] # (3 x 1) xyz coordinates of a point
30
+ transformed_points = R * points
31
+
32
+ To apply the same matrix to points which are row vectors, the R matrix
33
+ can be transposed and pre multiplied by the points:
34
+
35
+ e.g.
36
+ points = [[0, 1, 2]] # (1 x 3) xyz coordinates of a point
37
+ transformed_points = points * R.transpose(1, 0)
38
+ """
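# Illustrative check of the column-vector convention described above
# (hypothetical values, not from the original file):
#
#   R = quaternion_to_matrix(torch.tensor([1.0, 0.0, 0.0, 0.0]))  # identity rotation
#   points = torch.tensor([[0.0], [1.0], [2.0]])                  # (3, 1) column vector
#   transformed = R @ points                                       # post-multiplication, (3, 1)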
39
+
40
+
41
+ def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
42
+ """
43
+ Convert rotations given as quaternions to rotation matrices.
44
+
45
+ Args:
46
+ quaternions: quaternions with real part first,
47
+ as tensor of shape (..., 4).
48
+
49
+ Returns:
50
+ Rotation matrices as tensor of shape (..., 3, 3).
51
+ """
52
+ r, i, j, k = torch.unbind(quaternions, -1)
53
+ # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
54
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
55
+
56
+ o = torch.stack(
57
+ (
58
+ 1 - two_s * (j * j + k * k),
59
+ two_s * (i * j - k * r),
60
+ two_s * (i * k + j * r),
61
+ two_s * (i * j + k * r),
62
+ 1 - two_s * (i * i + k * k),
63
+ two_s * (j * k - i * r),
64
+ two_s * (i * k - j * r),
65
+ two_s * (j * k + i * r),
66
+ 1 - two_s * (i * i + j * j),
67
+ ),
68
+ -1,
69
+ )
70
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
71
+
72
+
73
+ def _copysign(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
74
+ """
75
+ Return a tensor where each element has the absolute value taken from the,
76
+ corresponding element of a, with sign taken from the corresponding
77
+ element of b. This is like the standard copysign floating-point operation,
78
+ but is not careful about negative 0 and NaN.
79
+
80
+ Args:
81
+ a: source tensor.
82
+ b: tensor whose signs will be used, of the same shape as a.
83
+
84
+ Returns:
85
+ Tensor of the same shape as a with the signs of b.
86
+ """
87
+ signs_differ = (a < 0) != (b < 0)
88
+ return torch.where(signs_differ, -a, a)
89
+
90
+
91
+ def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
92
+ """
93
+ Returns torch.sqrt(torch.max(0, x))
94
+ but with a zero subgradient where x is 0.
95
+ """
96
+ ret = torch.zeros_like(x)
97
+ positive_mask = x > 0
98
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
99
+ return ret
100
+
101
+
102
+ def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
103
+ """
104
+ Convert rotations given as rotation matrices to quaternions.
105
+
106
+ Args:
107
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
108
+
109
+ Returns:
110
+ quaternions with real part first, as tensor of shape (..., 4).
111
+ """
112
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
113
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
114
+
115
+ batch_dim = matrix.shape[:-2]
116
+ m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
117
+ matrix.reshape(batch_dim + (9,)), dim=-1
118
+ )
119
+
120
+ q_abs = _sqrt_positive_part(
121
+ torch.stack(
122
+ [
123
+ 1.0 + m00 + m11 + m22,
124
+ 1.0 + m00 - m11 - m22,
125
+ 1.0 - m00 + m11 - m22,
126
+ 1.0 - m00 - m11 + m22,
127
+ ],
128
+ dim=-1,
129
+ )
130
+ )
131
+
132
+ # we produce the desired quaternion multiplied by each of r, i, j, k
133
+ quat_by_rijk = torch.stack(
134
+ [
135
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
136
+ # `int`.
137
+ torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
138
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
139
+ # `int`.
140
+ torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
141
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
142
+ # `int`.
143
+ torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
144
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
145
+ # `int`.
146
+ torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
147
+ ],
148
+ dim=-2,
149
+ )
150
+
151
+ # We floor here at 0.1 but the exact level is not important; if q_abs is small,
152
+ # the candidate won't be picked.
153
+ flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
154
+ quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
155
+
156
+ # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
157
+ # forall i; we pick the best-conditioned one (with the largest denominator)
158
+
159
+ return quat_candidates[
160
+ F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
161
+ ].reshape(batch_dim + (4,))
162
+
163
+
164
+ def _axis_angle_rotation(axis: str, angle: torch.Tensor) -> torch.Tensor:
165
+ """
166
+ Return the rotation matrices for rotations about one of the coordinate
167
+ axes of an Euler-angle convention, for each value of the angle given.
168
+
169
+ Args:
170
+ axis: Axis label "X", "Y", or "Z".
171
+ angle: any shape tensor of Euler angles in radians
172
+
173
+ Returns:
174
+ Rotation matrices as tensor of shape (..., 3, 3).
175
+ """
176
+
177
+ cos = torch.cos(angle)
178
+ sin = torch.sin(angle)
179
+ one = torch.ones_like(angle)
180
+ zero = torch.zeros_like(angle)
181
+
182
+ if axis == "X":
183
+ R_flat = (one, zero, zero, zero, cos, -sin, zero, sin, cos)
184
+ elif axis == "Y":
185
+ R_flat = (cos, zero, sin, zero, one, zero, -sin, zero, cos)
186
+ elif axis == "Z":
187
+ R_flat = (cos, -sin, zero, sin, cos, zero, zero, zero, one)
188
+ else:
189
+ raise ValueError("letter must be either X, Y or Z.")
190
+
191
+ return torch.stack(R_flat, -1).reshape(angle.shape + (3, 3))
192
+
193
+
194
+ def euler_angles_to_matrix(euler_angles: torch.Tensor, convention: str) -> torch.Tensor:
195
+ """
196
+ Convert rotations given as Euler angles in radians to rotation matrices.
197
+
198
+ Args:
199
+ euler_angles: Euler angles in radians as tensor of shape (..., 3).
200
+ convention: Convention string of three uppercase letters from
201
+ {"X", "Y", and "Z"}.
202
+
203
+ Returns:
204
+ Rotation matrices as tensor of shape (..., 3, 3).
205
+ """
206
+ if euler_angles.dim() == 0 or euler_angles.shape[-1] != 3:
207
+ raise ValueError("Invalid input euler angles.")
208
+ if len(convention) != 3:
209
+ raise ValueError("Convention must have 3 letters.")
210
+ if convention[1] in (convention[0], convention[2]):
211
+ raise ValueError(f"Invalid convention {convention}.")
212
+ for letter in convention:
213
+ if letter not in ("X", "Y", "Z"):
214
+ raise ValueError(f"Invalid letter {letter} in convention string.")
215
+ matrices = [
216
+ _axis_angle_rotation(c, e)
217
+ for c, e in zip(convention, torch.unbind(euler_angles, -1))
218
+ ]
219
+ # return functools.reduce(torch.matmul, matrices)
220
+ return torch.matmul(torch.matmul(matrices[0], matrices[1]), matrices[2])
221
+
222
+
223
+ def _angle_from_tan(
224
+ axis: str, other_axis: str, data, horizontal: bool, tait_bryan: bool
225
+ ) -> torch.Tensor:
226
+ """
227
+ Extract the first or third Euler angle from the two members of
228
+ the matrix which are positive constant times its sine and cosine.
229
+
230
+ Args:
231
+ axis: Axis label "X", "Y", or "Z" for the angle we are finding.
233
+ other_axis: Axis label "X", "Y", or "Z" for the middle axis in the
233
+ convention.
234
+ data: Rotation matrices as tensor of shape (..., 3, 3).
235
+ horizontal: Whether we are looking for the angle for the third axis,
236
+ which means the relevant entries are in the same row of the
237
+ rotation matrix. If not, they are in the same column.
238
+ tait_bryan: Whether the first and third axes in the convention differ.
239
+
240
+ Returns:
241
+ Euler Angles in radians for each matrix in data as a tensor
242
+ of shape (...).
243
+ """
244
+
245
+ i1, i2 = {"X": (2, 1), "Y": (0, 2), "Z": (1, 0)}[axis]
246
+ if horizontal:
247
+ i2, i1 = i1, i2
248
+ even = (axis + other_axis) in ["XY", "YZ", "ZX"]
249
+ if horizontal == even:
250
+ return torch.atan2(data[..., i1], data[..., i2])
251
+ if tait_bryan:
252
+ return torch.atan2(-data[..., i2], data[..., i1])
253
+ return torch.atan2(data[..., i2], -data[..., i1])
254
+
255
+
256
+ def _index_from_letter(letter: str) -> int:
257
+ if letter == "X":
258
+ return 0
259
+ if letter == "Y":
260
+ return 1
261
+ if letter == "Z":
262
+ return 2
263
+ raise ValueError("letter must be either X, Y or Z.")
264
+
265
+
266
+ def matrix_to_euler_angles(matrix: torch.Tensor, convention: str) -> torch.Tensor:
267
+ """
268
+ Convert rotations given as rotation matrices to Euler angles in radians.
269
+
270
+ Args:
271
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
272
+ convention: Convention string of three uppercase letters.
273
+
274
+ Returns:
275
+ Euler angles in radians as tensor of shape (..., 3).
276
+ """
277
+ if len(convention) != 3:
278
+ raise ValueError("Convention must have 3 letters.")
279
+ if convention[1] in (convention[0], convention[2]):
280
+ raise ValueError(f"Invalid convention {convention}.")
281
+ for letter in convention:
282
+ if letter not in ("X", "Y", "Z"):
283
+ raise ValueError(f"Invalid letter {letter} in convention string.")
284
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
285
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
286
+ i0 = _index_from_letter(convention[0])
287
+ i2 = _index_from_letter(convention[2])
288
+ tait_bryan = i0 != i2
289
+ if tait_bryan:
290
+ central_angle = torch.asin(
291
+ matrix[..., i0, i2] * (-1.0 if i0 - i2 in [-1, 2] else 1.0)
292
+ )
293
+ else:
294
+ central_angle = torch.acos(matrix[..., i0, i0])
295
+
296
+ o = (
297
+ _angle_from_tan(
298
+ convention[0], convention[1], matrix[..., i2], False, tait_bryan
299
+ ),
300
+ central_angle,
301
+ _angle_from_tan(
302
+ convention[2], convention[1], matrix[..., i0, :], True, tait_bryan
303
+ ),
304
+ )
305
+ return torch.stack(o, -1)
306
+
307
+
308
+ def random_quaternions(
309
+ n: int, dtype: Optional[torch.dtype] = None, device: Optional[Device] = None
310
+ ) -> torch.Tensor:
311
+ """
312
+ Generate random quaternions representing rotations,
313
+ i.e. versors with nonnegative real part.
314
+
315
+ Args:
316
+ n: Number of quaternions in a batch to return.
317
+ dtype: Type to return.
318
+ device: Desired device of returned tensor. Default:
319
+ uses the current device for the default tensor type.
320
+
321
+ Returns:
322
+ Quaternions as tensor of shape (N, 4).
323
+ """
324
+ if isinstance(device, str):
325
+ device = torch.device(device)
326
+ o = torch.randn((n, 4), dtype=dtype, device=device)
327
+ s = (o * o).sum(1)
328
+ o = o / _copysign(torch.sqrt(s), o[:, 0])[:, None]
329
+ return o
330
+
331
+
332
+ def random_rotations(
333
+ n: int, dtype: Optional[torch.dtype] = None, device: Optional[Device] = None
334
+ ) -> torch.Tensor:
335
+ """
336
+ Generate random rotations as 3x3 rotation matrices.
337
+
338
+ Args:
339
+ n: Number of rotation matrices in a batch to return.
340
+ dtype: Type to return.
341
+ device: Device of returned tensor. Default: if None,
342
+ uses the current device for the default tensor type.
343
+
344
+ Returns:
345
+ Rotation matrices as tensor of shape (n, 3, 3).
346
+ """
347
+ quaternions = random_quaternions(n, dtype=dtype, device=device)
348
+ return quaternion_to_matrix(quaternions)
349
+
350
+
351
+ def random_rotation(
352
+ dtype: Optional[torch.dtype] = None, device: Optional[Device] = None
353
+ ) -> torch.Tensor:
354
+ """
355
+ Generate a single random 3x3 rotation matrix.
356
+
357
+ Args:
358
+ dtype: Type to return
359
+ device: Device of returned tensor. Default: if None,
360
+ uses the current device for the default tensor type
361
+
362
+ Returns:
363
+ Rotation matrix as tensor of shape (3, 3).
364
+ """
365
+ return random_rotations(1, dtype, device)[0]
366
+
367
+
368
+ def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
369
+ """
370
+ Convert a unit quaternion to a standard form: one in which the real
371
+ part is non negative.
372
+
373
+ Args:
374
+ quaternions: Quaternions with real part first,
375
+ as tensor of shape (..., 4).
376
+
377
+ Returns:
378
+ Standardized quaternions as tensor of shape (..., 4).
379
+ """
380
+ return torch.where(quaternions[..., 0:1] < 0, -quaternions, quaternions)
381
+
382
+
383
+ def quaternion_raw_multiply(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
384
+ """
385
+ Multiply two quaternions.
386
+ Usual torch rules for broadcasting apply.
387
+
388
+ Args:
389
+ a: Quaternions as tensor of shape (..., 4), real part first.
390
+ b: Quaternions as tensor of shape (..., 4), real part first.
391
+
392
+ Returns:
393
+ The product of a and b, a tensor of quaternions shape (..., 4).
394
+ """
395
+ aw, ax, ay, az = torch.unbind(a, -1)
396
+ bw, bx, by, bz = torch.unbind(b, -1)
397
+ ow = aw * bw - ax * bx - ay * by - az * bz
398
+ ox = aw * bx + ax * bw + ay * bz - az * by
399
+ oy = aw * by - ax * bz + ay * bw + az * bx
400
+ oz = aw * bz + ax * by - ay * bx + az * bw
401
+ return torch.stack((ow, ox, oy, oz), -1)
402
+
403
+
404
+ def quaternion_multiply(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
405
+ """
406
+ Multiply two quaternions representing rotations, returning the quaternion
407
+ representing their composition, i.e. the versor with nonnegative real part.
408
+ Usual torch rules for broadcasting apply.
409
+
410
+ Args:
411
+ a: Quaternions as tensor of shape (..., 4), real part first.
412
+ b: Quaternions as tensor of shape (..., 4), real part first.
413
+
414
+ Returns:
415
+ The product of a and b, a tensor of quaternions of shape (..., 4).
416
+ """
417
+ ab = quaternion_raw_multiply(a, b)
418
+ return standardize_quaternion(ab)
419
+
420
+
421
+ def quaternion_invert(quaternion: torch.Tensor) -> torch.Tensor:
422
+ """
423
+ Given a quaternion representing rotation, get the quaternion representing
424
+ its inverse.
425
+
426
+ Args:
427
+ quaternion: Quaternions as tensor of shape (..., 4), with real part
428
+ first, which must be versors (unit quaternions).
429
+
430
+ Returns:
431
+ The inverse, a tensor of quaternions of shape (..., 4).
432
+ """
433
+
434
+ scaling = torch.tensor([1, -1, -1, -1], device=quaternion.device)
435
+ return quaternion * scaling
436
+
437
+
438
+ def quaternion_apply(quaternion: torch.Tensor, point: torch.Tensor) -> torch.Tensor:
439
+ """
440
+ Apply the rotation given by a quaternion to a 3D point.
441
+ Usual torch rules for broadcasting apply.
442
+
443
+ Args:
444
+ quaternion: Tensor of quaternions, real part first, of shape (..., 4).
445
+ point: Tensor of 3D points of shape (..., 3).
446
+
447
+ Returns:
448
+ Tensor of rotated points of shape (..., 3).
449
+ """
450
+ if point.size(-1) != 3:
451
+ raise ValueError(f"Points are not in 3D, {point.shape}.")
452
+ real_parts = point.new_zeros(point.shape[:-1] + (1,))
453
+ point_as_quaternion = torch.cat((real_parts, point), -1)
454
+ out = quaternion_raw_multiply(
455
+ quaternion_raw_multiply(quaternion, point_as_quaternion),
456
+ quaternion_invert(quaternion),
457
+ )
458
+ return out[..., 1:]
459
+
460
+
461
+ def axis_angle_to_matrix(axis_angle: torch.Tensor) -> torch.Tensor:
462
+ """
463
+ Convert rotations given as axis/angle to rotation matrices.
464
+
465
+ Args:
466
+ axis_angle: Rotations given as a vector in axis angle form,
467
+ as a tensor of shape (..., 3), where the magnitude is
468
+ the angle turned anticlockwise in radians around the
469
+ vector's direction.
470
+
471
+ Returns:
472
+ Rotation matrices as tensor of shape (..., 3, 3).
473
+ """
474
+ return quaternion_to_matrix(axis_angle_to_quaternion(axis_angle))
475
+
476
+
477
+ def matrix_to_axis_angle(matrix: torch.Tensor) -> torch.Tensor:
478
+ """
479
+ Convert rotations given as rotation matrices to axis/angle.
480
+
481
+ Args:
482
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
483
+
484
+ Returns:
485
+ Rotations given as a vector in axis angle form, as a tensor
486
+ of shape (..., 3), where the magnitude is the angle
487
+ turned anticlockwise in radians around the vector's
488
+ direction.
489
+ """
490
+ return quaternion_to_axis_angle(matrix_to_quaternion(matrix))
491
+
492
+
493
+ def axis_angle_to_quaternion(axis_angle: torch.Tensor) -> torch.Tensor:
494
+ """
495
+ Convert rotations given as axis/angle to quaternions.
496
+
497
+ Args:
498
+ axis_angle: Rotations given as a vector in axis angle form,
499
+ as a tensor of shape (..., 3), where the magnitude is
500
+ the angle turned anticlockwise in radians around the
501
+ vector's direction.
502
+
503
+ Returns:
504
+ quaternions with real part first, as tensor of shape (..., 4).
505
+ """
506
+ angles = torch.norm(axis_angle, p=2, dim=-1, keepdim=True)
507
+ half_angles = angles * 0.5
508
+ eps = 1e-6
509
+ small_angles = angles.abs() < eps
510
+ sin_half_angles_over_angles = torch.empty_like(angles)
511
+ sin_half_angles_over_angles[~small_angles] = (
512
+ torch.sin(half_angles[~small_angles]) / angles[~small_angles]
513
+ )
514
+ # for x small, sin(x/2) is about x/2 - (x/2)^3/6
515
+ # so sin(x/2)/x is about 1/2 - (x*x)/48
516
+ sin_half_angles_over_angles[small_angles] = (
517
+ 0.5 - (angles[small_angles] * angles[small_angles]) / 48
518
+ )
519
+ quaternions = torch.cat(
520
+ [torch.cos(half_angles), axis_angle * sin_half_angles_over_angles], dim=-1
521
+ )
522
+ return quaternions
523
+
524
+
525
+ def quaternion_to_axis_angle(quaternions: torch.Tensor) -> torch.Tensor:
526
+ """
527
+ Convert rotations given as quaternions to axis/angle.
528
+
529
+ Args:
530
+ quaternions: quaternions with real part first,
531
+ as tensor of shape (..., 4).
532
+
533
+ Returns:
534
+ Rotations given as a vector in axis angle form, as a tensor
535
+ of shape (..., 3), where the magnitude is the angle
536
+ turned anticlockwise in radians around the vector's
537
+ direction.
538
+ """
539
+ norms = torch.norm(quaternions[..., 1:], p=2, dim=-1, keepdim=True)
540
+ half_angles = torch.atan2(norms, quaternions[..., :1])
541
+ angles = 2 * half_angles
542
+ eps = 1e-6
543
+ small_angles = angles.abs() < eps
544
+ sin_half_angles_over_angles = torch.empty_like(angles)
545
+ sin_half_angles_over_angles[~small_angles] = (
546
+ torch.sin(half_angles[~small_angles]) / angles[~small_angles]
547
+ )
548
+ # for x small, sin(x/2) is about x/2 - (x/2)^3/6
549
+ # so sin(x/2)/x is about 1/2 - (x*x)/48
550
+ sin_half_angles_over_angles[small_angles] = (
551
+ 0.5 - (angles[small_angles] * angles[small_angles]) / 48
552
+ )
553
+ return quaternions[..., 1:] / sin_half_angles_over_angles
554
+
555
+
556
+ def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
557
+ """
558
+ Converts 6D rotation representation by Zhou et al. [1] to rotation matrix
559
+ using Gram--Schmidt orthogonalization per Section B of [1].
560
+ Args:
561
+ d6: 6D rotation representation, of size (*, 6)
562
+
563
+ Returns:
564
+ batch of rotation matrices of size (*, 3, 3)
565
+
566
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
567
+ On the Continuity of Rotation Representations in Neural Networks.
568
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
569
+ Retrieved from http://arxiv.org/abs/1812.07035
570
+ """
571
+
572
+ a1, a2 = d6[..., :3], d6[..., 3:]
573
+ b1 = F.normalize(a1, dim=-1)
574
+ b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
575
+ b2 = F.normalize(b2, dim=-1)
576
+ b3 = torch.cross(b1, b2, dim=-1)
577
+ return torch.stack((b1, b2, b3), dim=-2)
578
+
579
+
580
+ def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor:
581
+ """
582
+ Converts rotation matrices to 6D rotation representation by Zhou et al. [1]
583
+ by dropping the last row. Note that 6D representation is not unique.
584
+ Args:
585
+ matrix: batch of rotation matrices of size (*, 3, 3)
586
+
587
+ Returns:
588
+ 6D rotation representation, of size (*, 6)
589
+
590
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
591
+ On the Continuity of Rotation Representations in Neural Networks.
592
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
593
+ Retrieved from http://arxiv.org/abs/1812.07035
594
+ """
595
+ batch_dim = matrix.size()[:-2]
596
+ return matrix[..., :2, :].clone().reshape(batch_dim + (6,))
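For orientation, these helpers mirror the PyTorch3D rotation-conversion API. A minimal round-trip sanity check is sketched below, assuming `analysis/` is on `PYTHONPATH` so the module imports as `src.common.rotation3d`; the batch size and tolerances are illustrative.

import torch
from src.common import rotation3d

R = rotation3d.random_rotations(4)              # (4, 3, 3)
q = rotation3d.matrix_to_quaternion(R)          # (4, 4), real part first
aa = rotation3d.quaternion_to_axis_angle(q)     # (4, 3)
R_back = rotation3d.axis_angle_to_matrix(aa)    # (4, 3, 3)
assert torch.allclose(R, R_back, atol=1e-4)

# Rotating points with the matrix (column-vector convention) and with the
# quaternion should agree.
p = torch.rand(4, 3)
assert torch.allclose(
    torch.einsum("bij,bj->bi", R, p),
    rotation3d.quaternion_apply(q, p),
    atol=1e-4,
)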
analysis/src/data/__init__.py ADDED
File without changes
analysis/src/data/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (141 Bytes).
 
analysis/src/data/__pycache__/protein_datamodule.cpython-39.pyc ADDED
Binary file (10.8 kB).
 
analysis/src/data/components/__init__.py ADDED
File without changes
analysis/src/data/components/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (152 Bytes).
 
analysis/src/data/components/__pycache__/dataset.cpython-39.pyc ADDED
Binary file (10.1 kB).
 
analysis/src/data/components/dataset.py ADDED
@@ -0,0 +1,321 @@
1
+ """Protein dataset class."""
2
+ import os
3
+ import pickle
4
+ from pathlib import Path
5
+ from glob import glob
6
+ from typing import Optional, Sequence, List, Union
7
+ from functools import lru_cache
8
+ import tree
9
+
10
+ from tqdm import tqdm
11
+ import numpy as np
12
+ import pandas as pd
13
+ import torch
14
+
15
+ from src.common import residue_constants, data_transforms, rigid_utils, protein
16
+
17
+
18
+ CA_IDX = residue_constants.atom_order['CA']
19
+ DTYPE_MAPPING = {
20
+ 'aatype': torch.long,
21
+ 'atom_positions': torch.double,
22
+ 'atom_mask': torch.double,
23
+ }
24
+
25
+
26
+ class ProteinFeatureTransform:
27
+ def __init__(self,
28
+ unit: Optional[str] = 'angstrom',
29
+ truncate_length: Optional[int] = None,
30
+ strip_missing_residues: bool = True,
31
+ recenter_and_scale: bool = True,
32
+ eps: float = 1e-8,
33
+ ):
34
+ if unit == 'angstrom':
35
+ self.coordinate_scale = 1.0
36
+ elif unit in ('nm', 'nanometer'):
37
+ self.coordinate_scale = 0.1
38
+ else:
39
+ raise ValueError(f"Invalid unit: {unit}")
40
+
41
+ if truncate_length is not None:
42
+ assert truncate_length > 0, f"Invalid truncate_length: {truncate_length}"
43
+ self.truncate_length = truncate_length
44
+
45
+ self.strip_missing_residues = strip_missing_residues
46
+ self.recenter_and_scale = recenter_and_scale
47
+ self.eps = eps
48
+
49
+ def __call__(self, chain_feats):
50
+ chain_feats = self.patch_feats(chain_feats)
51
+
52
+ if self.strip_missing_residues:
53
+ chain_feats = self.strip_ends(chain_feats)
54
+
55
+ if self.truncate_length is not None:
56
+ chain_feats = self.random_truncate(chain_feats, max_len=self.truncate_length)
57
+
58
+ # Recenter and scale atom positions
59
+ if self.recenter_and_scale:
60
+ chain_feats = self.recenter_and_scale_coords(chain_feats, coordinate_scale=self.coordinate_scale, eps=self.eps)
61
+
62
+ # Map to torch Tensor
63
+ chain_feats = self.map_to_tensors(chain_feats)
64
+ # Add extra features from AF2
65
+ chain_feats = self.protein_data_transform(chain_feats)
66
+
67
+ # ** refer to line 170 in pdb_data_loader.py **
68
+ return chain_feats
69
+
70
+ @staticmethod
71
+ def patch_feats(chain_feats):
72
+ seq_mask = chain_feats['atom_mask'][:, CA_IDX] # a little hack here
73
+ # residue_idx = np.arange(seq_mask.shape[0], dtype=np.int64)
74
+ residue_idx = chain_feats['residue_index'] - np.min(chain_feats['residue_index']) # start from 0, possibly has chain break
75
+ patch_feats = {
76
+ 'seq_mask': seq_mask,
77
+ 'residue_mask': seq_mask,
78
+ 'residue_idx': residue_idx,
79
+ 'fixed_mask': np.zeros_like(seq_mask),
80
+ 'sc_ca_t': np.zeros(seq_mask.shape + (3, )),
81
+ }
82
+ chain_feats.update(patch_feats)
83
+ return chain_feats
84
+
85
+ @staticmethod
86
+ def strip_ends(chain_feats):
87
+ # Strip missing residues on both ends
88
+ modeled_idx = np.where(chain_feats['aatype'] != 20)[0]
89
+ min_idx, max_idx = np.min(modeled_idx), np.max(modeled_idx)
90
+ chain_feats = tree.map_structure(
91
+ lambda x: x[min_idx : (max_idx+1)], chain_feats)
92
+ return chain_feats
93
+
94
+ @staticmethod
95
+ def random_truncate(chain_feats, max_len):
96
+ L = chain_feats['aatype'].shape[0]
97
+ if L > max_len:
98
+ # Randomly truncate
99
+ start = np.random.randint(0, L - max_len + 1)
100
+ end = start + max_len
101
+ chain_feats = tree.map_structure(
102
+ lambda x: x[start : end], chain_feats)
103
+ return chain_feats
104
+
105
+ @staticmethod
106
+ def map_to_tensors(chain_feats):
107
+ chain_feats = {k: torch.as_tensor(v) for k,v in chain_feats.items()}
108
+ # Alter dtype
109
+ for k, dtype in DTYPE_MAPPING.items():
110
+ if k in chain_feats:
111
+ chain_feats[k] = chain_feats[k].type(dtype)
112
+ return chain_feats
113
+
114
+ @staticmethod
115
+ def recenter_and_scale_coords(chain_feats, coordinate_scale, eps=1e-8):
116
+ # recenter and scale atom positions
117
+ bb_pos = chain_feats['atom_positions'][:, CA_IDX]
118
+ bb_center = np.sum(bb_pos, axis=0) / (np.sum(chain_feats['seq_mask']) + eps)
119
+ centered_pos = chain_feats['atom_positions'] - bb_center[None, None, :]
120
+ scaled_pos = centered_pos * coordinate_scale
121
+ chain_feats['atom_positions'] = scaled_pos * chain_feats['atom_mask'][..., None]
122
+ return chain_feats
123
+
124
+ @staticmethod
125
+ def protein_data_transform(chain_feats):
126
+ chain_feats.update(
127
+ {
128
+ "all_atom_positions": chain_feats["atom_positions"],
129
+ "all_atom_mask": chain_feats["atom_mask"],
130
+ }
131
+ )
132
+ chain_feats = data_transforms.atom37_to_frames(chain_feats)
133
+ chain_feats = data_transforms.atom37_to_torsion_angles("")(chain_feats)
134
+ chain_feats = data_transforms.get_backbone_frames(chain_feats)
135
+ chain_feats = data_transforms.get_chi_angles(chain_feats)
136
+ chain_feats = data_transforms.make_pseudo_beta("")(chain_feats)
137
+ chain_feats = data_transforms.make_atom14_masks(chain_feats)
138
+ chain_feats = data_transforms.make_atom14_positions(chain_feats)
139
+
140
+ # Add convenient key
141
+ chain_feats.pop("all_atom_positions")
142
+ chain_feats.pop("all_atom_mask")
143
+ return chain_feats
144
+
145
+
146
+ class MetadataFilter:
147
+ def __init__(self,
148
+ min_len: Optional[int] = None,
149
+ max_len: Optional[int] = None,
150
+ min_chains: Optional[int] = None,
151
+ max_chains: Optional[int] = None,
152
+ min_resolution: Optional[int] = None,
153
+ max_resolution: Optional[int] = None,
154
+ include_structure_method: Optional[List[str]] = None,
155
+ include_oligomeric_detail: Optional[List[str]] = None,
156
+ **kwargs,
157
+ ):
158
+ self.min_len = min_len
159
+ self.max_len = max_len
160
+ self.min_chains = min_chains
161
+ self.max_chains = max_chains
162
+ self.min_resolution = min_resolution
163
+ self.max_resolution = max_resolution
164
+ self.include_structure_method = include_structure_method
165
+ self.include_oligomeric_detail = include_oligomeric_detail
166
+
167
+ def __call__(self, df):
168
+ _pre_filter_len = len(df)
169
+ if self.min_len is not None:
170
+ df = df[df['raw_seq_len'] >= self.min_len]
171
+ if self.max_len is not None:
172
+ df = df[df['raw_seq_len'] <= self.max_len]
173
+ if self.min_chains is not None:
174
+ df = df[df['num_chains'] >= self.min_chains]
175
+ if self.max_chains is not None:
176
+ df = df[df['num_chains'] <= self.max_chains]
177
+ if self.min_resolution is not None:
178
+ df = df[df['resolution'] >= self.min_resolution]
179
+ if self.max_resolution is not None:
180
+ df = df[df['resolution'] <= self.max_resolution]
181
+ if self.include_structure_method is not None:
182
+ df = df[df['include_structure_method'].isin(self.include_structure_method)]
183
+ if self.include_oligomeric_detail is not None:
184
+ df = df[df['include_oligomeric_detail'].isin(self.include_oligomeric_detail)]
185
+
186
+ print(f">>> Metadata filter kept {len(df)} of {_pre_filter_len} samples")
187
+ return df
188
+
189
+
190
+ class RandomAccessProteinDataset(torch.utils.data.Dataset):
191
+ """Random access to pickle protein objects of dataset.
192
+
193
+ dict_keys(['atom_positions', 'aatype', 'atom_mask', 'residue_index', 'chain_index', 'b_factors'])
194
+
195
+ Note that each value is a ndarray in shape (L, *), for example:
196
+ 'atom_positions': (L, 37, 3)
197
+ """
198
+ def __init__(self,
199
+ path_to_dataset: Union[Path, str],
200
+ path_to_seq_embedding: Optional[Path] = None,
201
+ metadata_filter: Optional[MetadataFilter] = None,
202
+ training: bool = True,
203
+ transform: Optional[ProteinFeatureTransform] = None,
204
+ suffix: Optional[str] = '.pkl',
205
+ accession_code_fillter: Optional[Sequence[str]] = None,
206
+ **kwargs,
207
+ ):
208
+ super().__init__()
209
+ path_to_dataset = os.path.expanduser(path_to_dataset)
210
+ suffix = suffix if suffix.startswith('.') else '.' + suffix
211
+ assert suffix in ('.pkl', '.pdb'), f"Invalid suffix: {suffix}"
212
+
213
+ if os.path.isfile(path_to_dataset): # path to csv file
214
+ assert path_to_dataset.endswith('.csv'), f"Invalid file extension: {path_to_dataset} (have to be .csv)"
215
+ self._df = pd.read_csv(path_to_dataset)
216
+ self._df = self._df.sort_values('modeled_seq_len', ascending=False)
217
+ if metadata_filter:
218
+ self._df = metadata_filter(self._df)
219
+ self._data = self._df['processed_complex_path'].tolist()
220
+ elif os.path.isdir(path_to_dataset): # path to directory
221
+ self._data = sorted(glob(os.path.join(path_to_dataset, '*' + suffix)))
222
+ assert len(self._data) > 0, f"No {suffix} file found in '{path_to_dataset}'"
223
+ else: # path as glob pattern
224
+ _pattern = path_to_dataset
225
+ self._data = sorted(glob(_pattern))
226
+ assert len(self._data) > 0, f"No files found in '{_pattern}'"
227
+
228
+ if accession_code_fillter and len(accession_code_fillter) > 0:
229
+ self._data = [p for p in self._data
230
+ if np.isin(os.path.splitext(os.path.basename(p))[0], accession_code_fillter)
231
+ ]
232
+
233
+ self.data = np.asarray(self._data)
234
+ self.path_to_seq_embedding = os.path.expanduser(path_to_seq_embedding) \
235
+ if path_to_seq_embedding is not None else None
236
+ self.suffix = suffix
237
+ self.transform = transform
238
+ self.training = training # not implemented yet
239
+
240
+
241
+ @property
242
+ def num_samples(self):
243
+ return len(self.data)
244
+
245
+ def len(self):
246
+ return self.__len__()
247
+
248
+ def __len__(self):
249
+ return self.num_samples
250
+
251
+ def get(self, idx):
252
+ return self.__getitem__(idx)
253
+
254
+ @lru_cache(maxsize=100)
255
+ def __getitem__(self, idx):
256
+ """return single pyg.Data() instance
257
+ """
258
+ data_path = self.data[idx]
259
+ accession_code = os.path.splitext(os.path.basename(data_path))[0]
260
+
261
+ if self.suffix == '.pkl':
262
+ # Load pickled protein
263
+ with open(data_path, 'rb') as f:
264
+ data_object = pickle.load(f)
265
+ elif self.suffix == '.pdb':
266
+ # Load pdb file
267
+ with open(data_path, 'r') as f:
268
+ pdb_string = f.read()
269
+ data_object = protein.from_pdb_string(pdb_string).to_dict()
270
+
271
+ # Apply data transform
272
+ if self.transform is not None:
273
+ data_object = self.transform(data_object)
274
+
275
+ # Get sequence embedding if have
276
+ if self.path_to_seq_embedding is not None:
277
+ embed_dict = torch.load(
278
+ os.path.join(self.path_to_seq_embedding, f"{accession_code}.pt")
279
+ )
280
+ data_object.update(
281
+ {
282
+ 'seq_emb': embed_dict['representations'][33].float(),
283
+ } # 33 is for ESM650M
284
+ )
285
+
286
+ data_object['accession_code'] = accession_code
287
+ return data_object # dict of arrays
288
+
289
+
290
+
291
+ class PretrainPDBDataset(RandomAccessProteinDataset):
292
+ def __init__(self,
293
+ path_to_dataset: str,
294
+ metadata_filter: MetadataFilter,
295
+ transform: ProteinFeatureTransform,
296
+ **kwargs,
297
+ ):
298
+ super(PretrainPDBDataset, self).__init__(path_to_dataset=path_to_dataset,
299
+ metadata_filter=metadata_filter,
300
+ transform=transform,
301
+ **kwargs,
302
+ )
303
+
304
+
305
+ class SamplingPDBDataset(RandomAccessProteinDataset):
306
+ def __init__(self,
307
+ path_to_dataset: str,
308
+ training: bool = False,
309
+ suffix: str = '.pdb',
310
+ transform: Optional[ProteinFeatureTransform] = None,
311
+ accession_code_fillter: Optional[Sequence[str]] = None,
312
+ ):
313
+ assert os.path.isdir(path_to_dataset), f"Invalid path (expected to be directory): {path_to_dataset}"
314
+ super(SamplingPDBDataset, self).__init__(path_to_dataset=path_to_dataset,
315
+ training=training,
316
+ suffix=suffix,
317
+ transform=transform,
318
+ accession_code_fillter=accession_code_fillter,
319
+ metadata_filter=None,
320
+ )
321
+
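For orientation, a minimal sketch of wiring the transform and dataset classes above together over a directory of PDB files. The directory path is hypothetical, and `analysis/` is assumed to be on `PYTHONPATH` so the `src.*` imports resolve.

from src.data.components.dataset import ProteinFeatureTransform, SamplingPDBDataset

# Hypothetical directory containing one .pdb file per structure
dataset = SamplingPDBDataset(
    path_to_dataset="./example_pdbs",
    transform=ProteinFeatureTransform(unit="angstrom", strip_missing_residues=True),
)
sample = dataset[0]
print(sample["accession_code"], sample["aatype"].shape)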
analysis/src/data/protein_datamodule.py ADDED
@@ -0,0 +1,242 @@
1
+ from typing import Any, Dict, Optional, Tuple, List, Sequence
2
+
3
+ import torch
4
+ from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split
5
+ from lightning import LightningDataModule
6
+ from hydra.utils import instantiate
7
+
8
+
9
+ class BatchTensorConverter:
10
+ """Callable to convert an unprocessed (labels + strings) batch to a
11
+ processed (labels + tensor) batch.
12
+ """
13
+ def __init__(self, target_keys: Optional[List] = None):
14
+ self.target_keys = target_keys
15
+
16
+ def __call__(self, raw_batch: Sequence[Dict[str, object]]):
17
+ B = len(raw_batch)
18
+ # Only do for Tensor
19
+ target_keys = self.target_keys \
20
+ if self.target_keys is not None else [k for k,v in raw_batch[0].items() if torch.is_tensor(v)]
21
+ # Non-array, for example string, int
22
+ non_array_keys = [k for k in raw_batch[0] if k not in target_keys]
23
+ collated_batch = dict()
24
+ for k in target_keys:
25
+ collated_batch[k] = self.collate_dense_tensors([d[k] for d in raw_batch], pad_v=0.0)
26
+ for k in non_array_keys: # return non-array keys as is
27
+ collated_batch[k] = [d[k] for d in raw_batch]
28
+ return collated_batch
29
+
30
+ @staticmethod
31
+ def collate_dense_tensors(samples: Sequence, pad_v: float = 0.0):
32
+ """
33
+ Takes a list of tensors with the following dimensions:
34
+ [(d_11, ..., d_1K),
35
+ (d_21, ..., d_2K),
36
+ ...,
37
+ (d_N1, ..., d_NK)]
38
+ and stack + pads them into a single tensor of:
39
+ (N, max_i=1,N { d_i1 }, ..., max_i=1,N {diK})
40
+ """
41
+ if len(samples) == 0:
42
+ return torch.Tensor()
43
+ if len(set(x.dim() for x in samples)) != 1:
44
+ raise RuntimeError(
45
+ f"Samples has varying dimensions: {[x.dim() for x in samples]}"
46
+ )
47
+ (device,) = tuple(set(x.device for x in samples)) # assumes all on same device
48
+ max_shape = [max(lst) for lst in zip(*[x.shape for x in samples])]
49
+ result = torch.empty(
50
+ len(samples), *max_shape, dtype=samples[0].dtype, device=device
51
+ )
52
+ result.fill_(pad_v)
53
+ for i in range(len(samples)):
54
+ result_i = result[i]
55
+ t = samples[i]
56
+ result_i[tuple(slice(0, k) for k in t.shape)] = t
57
+ return result
58
+
59
+
60
+ class ProteinDataModule(LightningDataModule):
61
+ """`LightningDataModule` for a single protein dataset,
62
+ for pretrain or finetune purpose.
63
+
64
+ ### To be revised.###
65
+
66
+ The MNIST database of handwritten digits has a training set of 60,000 examples, and a test set of 10,000 examples.
67
+ It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a
68
+ fixed-size image. The original black and white images from NIST were size normalized to fit in a 20x20 pixel box
69
+ while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing
70
+ technique used by the normalization algorithm. the images were centered in a 28x28 image by computing the center of
71
+ mass of the pixels, and translating the image so as to position this point at the center of the 28x28 field.
72
+
73
+ A `LightningDataModule` implements 7 key methods:
74
+
75
+ ```python
76
+ def prepare_data(self):
77
+ # Things to do on 1 GPU/TPU (not on every GPU/TPU in DDP).
78
+ # Download data, pre-process, split, save to disk, etc...
79
+
80
+ def setup(self, stage):
81
+ # Things to do on every process in DDP.
82
+ # Load data, set variables, etc...
83
+
84
+ def train_dataloader(self):
85
+ # return train dataloader
86
+
87
+ def val_dataloader(self):
88
+ # return validation dataloader
89
+
90
+ def test_dataloader(self):
91
+ # return test dataloader
92
+
93
+ def predict_dataloader(self):
94
+ # return predict dataloader
95
+
96
+ def teardown(self, stage):
97
+ # Called on every process in DDP.
98
+ # Clean up after fit or test.
99
+ ```
100
+
101
+ This allows you to share a full dataset without explaining how to download,
102
+ split, transform and process the data.
103
+
104
+ Read the docs:
105
+ https://lightning.ai/docs/pytorch/latest/data/datamodule.html
106
+ """
107
+
108
+     def __init__(
+         self,
+         dataset: torch.utils.data.Dataset,
+         batch_size: int = 64,
+         generator_seed: int = 42,
+         train_val_split: Tuple[float, float] = (0.95, 0.05),
+         num_workers: int = 0,
+         pin_memory: bool = False,
+         shuffle: bool = False,
+     ) -> None:
+         """Initialize a `ProteinDataModule`.
+
+         :param dataset: The full dataset to wrap; it is split into train/val in `setup("fit")`.
+         :param batch_size: The batch size. Defaults to `64`.
+         :param generator_seed: Seed for the random train/val split. Defaults to `42`.
+         :param train_val_split: Fractions for the train/val split. Defaults to `(0.95, 0.05)`.
+         :param num_workers: The number of dataloader workers. Defaults to `0`.
+         :param pin_memory: Whether to pin memory. Defaults to `False`.
+         :param shuffle: Whether the dataloaders shuffle samples. Defaults to `False`.
+         """
+         super().__init__()
+
+         # this line allows accessing init params with the 'self.hparams' attribute
+         # and also ensures init params will be stored in the ckpt
+         self.save_hyperparameters(logger=False)
+
+         self.dataset = dataset
+
+         self.data_train: Optional[Dataset] = None
+         self.data_val: Optional[Dataset] = None
+         self.data_test: Optional[Dataset] = None
+
+         self.batch_size_per_device = batch_size
+
+     def prepare_data(self) -> None:
+         """Download data if needed. Lightning ensures that `self.prepare_data()` is called only
+         within a single process on CPU, so you can safely add your downloading logic within. In
+         case of multi-node training, the execution of this hook depends upon
+         `self.prepare_data_per_node()`.
+
+         Do not use it to assign state (self.x = y).
+         """
+         pass
+
+     def setup(self, stage: Optional[str] = None) -> None:
+         """Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
+
+         This method is called by Lightning before `trainer.fit()`, `trainer.validate()`, `trainer.test()`, and
+         `trainer.predict()`, so be careful not to execute things like random split twice! Also, it is called after
+         `self.prepare_data()` and there is a barrier in between which ensures that all the processes proceed to
+         `self.setup()` once the data is prepared and available for use.
+
+         :param stage: The stage to set up. Either `"fit"`, `"validate"`, `"test"`, or `"predict"`. Defaults to ``None``.
+         """
+         # Divide batch size by the number of devices.
+         if self.trainer is not None:
+             if self.hparams.batch_size % self.trainer.world_size != 0:
+                 raise RuntimeError(
+                     f"Batch size ({self.hparams.batch_size}) is not divisible by the number of devices ({self.trainer.world_size})."
+                 )
+             self.batch_size_per_device = self.hparams.batch_size // self.trainer.world_size
+
+         # Load and split the dataset only if it has not been loaded already.
+         if stage == 'fit':
+             if not self.data_train and not self.data_val:
+                 self.data_train, self.data_val = random_split(
+                     dataset=self.dataset,
+                     lengths=self.hparams.train_val_split,
+                     generator=torch.Generator().manual_seed(self.hparams.generator_seed),
+                 )
+         elif stage in ('predict', 'test'):
+             self.data_test = self.dataset
+         else:
+             raise NotImplementedError(f"Stage {stage} not implemented.")
+
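A small, purely illustrative sketch of the two pieces of logic in `setup`: the seeded fractional `random_split` and the division of the global batch size across devices (the numbers are made up):

```python
import torch
from torch.utils.data import TensorDataset, random_split

# Toy dataset split 95/5 with a fixed generator seed, mirroring the fractional
# `train_val_split` and `generator_seed` hyperparameters above.
toy = TensorDataset(torch.arange(100))
train, val = random_split(toy, lengths=(0.95, 0.05), generator=torch.Generator().manual_seed(42))
print(len(train), len(val))  # 95 5

# The global batch size is divided evenly across devices, e.g. 64 samples on 4 GPUs.
batch_size, world_size = 64, 4
assert batch_size % world_size == 0
print(batch_size // world_size)  # 16 per device
```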
+     def _dataloader_template(self, dataset: Dataset[Any]) -> DataLoader[Any]:
+         """Create a dataloader from a dataset.
+
+         :param dataset: The dataset.
+         :return: The dataloader.
+         """
+         batch_collator = BatchTensorConverter()  # list of dicts -> dict of tensors
+         return DataLoader(
+             dataset=dataset,
+             collate_fn=batch_collator,
+             batch_size=self.batch_size_per_device,
+             num_workers=self.hparams.num_workers,
+             pin_memory=self.hparams.pin_memory,
+             shuffle=self.hparams.shuffle,
+         )
+
+     def train_dataloader(self) -> DataLoader[Any]:
+         """Create and return the train dataloader.
+
+         :return: The train dataloader.
+         """
+         return self._dataloader_template(self.data_train)
+
+     def val_dataloader(self) -> DataLoader[Any]:
+         """Create and return the validation dataloader.
+
+         :return: The validation dataloader.
+         """
+         return self._dataloader_template(self.data_val)
+
+     def test_dataloader(self) -> DataLoader[Any]:
+         """Create and return the test dataloader.
+
+         :return: The test dataloader.
+         """
+         return self._dataloader_template(self.data_test)
+
+     def teardown(self, stage: Optional[str] = None) -> None:
+         """Lightning hook for cleaning up after `trainer.fit()`, `trainer.validate()`,
+         `trainer.test()`, and `trainer.predict()`.
+
+         :param stage: The stage being torn down. Either `"fit"`, `"validate"`, `"test"`, or `"predict"`.
+             Defaults to ``None``.
+         """
+         pass
+
+     def state_dict(self) -> Dict[Any, Any]:
+         """Called when saving a checkpoint. Implement to generate and save the datamodule state.
+
+         :return: A dictionary containing the datamodule state that you want to save.
+         """
+         return {}
+
+     def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+         """Called when loading a checkpoint. Implement to reload datamodule state given datamodule
+         `state_dict()`.
+
+         :param state_dict: The datamodule state returned by `self.state_dict()`.
+         """
+         pass
+
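To make the datamodule's lifecycle concrete, here is a minimal usage sketch. `MyProteinDataset` is a hypothetical stand-in for the repository's actual dataset class (see `analysis/src/data/components/dataset.py`) and is only assumed to return dicts mixing tensor and non-tensor values:

```python
import torch
from torch.utils.data import Dataset

class MyProteinDataset(Dataset):            # hypothetical stand-in dataset
    def __len__(self):
        return 100
    def __getitem__(self, idx):
        L = 50 + idx % 10                   # variable-length "proteins"
        return {"ca_coords": torch.randn(L, 3), "name": f"protein_{idx}"}

dm = ProteinDataModule(
    dataset=MyProteinDataset(),
    batch_size=8,
    train_val_split=(0.95, 0.05),
    shuffle=True,
)
dm.setup(stage="fit")                       # splits into train/val
batch = next(iter(dm.train_dataloader()))
print(batch["ca_coords"].shape)             # (8, max_len_in_batch, 3), zero-padded
print(batch["name"][:2])                    # non-tensor fields come back as plain lists
```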
analysis/src/eval.py ADDED
@@ -0,0 +1,217 @@
+ from typing import Any, Dict, List, Tuple
+ import os
+ from time import strftime
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ # import hydra
+ # import rootutils
+ # from lightning import LightningDataModule, LightningModule, Trainer
+ # from lightning.pytorch.loggers import Logger
+ from omegaconf import DictConfig
+
+ # rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+
+ # ------------------------------------------------------------------------------------ #
+ # the setup_root above is equivalent to:
+ # - adding project root dir to PYTHONPATH
+ #       (so you don't need to force user to install project as a package)
+ #       (necessary before importing any local modules e.g. `from src import utils`)
+ # - setting up PROJECT_ROOT environment variable
+ #       (which is used as a base for paths in "configs/paths/default.yaml")
+ #       (this way all filepaths are the same no matter where you run the code)
+ # - loading environment variables from ".env" in root dir
+ #
+ # you can remove it if you:
+ # 1. either install project as a package or move entry files to project root dir
+ # 2. set `root_dir` to "." in "configs/paths/default.yaml"
+ #
+ # more info: https://github.com/ashleve/rootutils
+ # ------------------------------------------------------------------------------------ #
+
+ from src.utils import (
+     RankedLogger,
+     extras,
+     instantiate_loggers,
+     log_hyperparameters,
+     task_wrapper,
+     checkpoint_utils,
+     plot_utils,
+ )
+ from src.common.pdb_utils import extract_backbone_coords
+ from src.metrics import metrics
+ from src.common.geo_utils import _find_rigid_alignment
+
+ log = RankedLogger(__name__, rank_zero_only=True)
+
+ def evaluate_prediction(pred_dir: str, target_dir: str = None, crystal_dir: str = None, tag: str = None):
+     """Evaluate prediction results based on PDB files.
+     """
+     if target_dir is None or not os.path.isdir(target_dir):
+         log.warning(f"target_dir {target_dir} does not exist. Skipping evaluation.")
+         return {}
+
+     assert os.path.isdir(pred_dir), f"pred_dir {pred_dir} is not a directory."
+
+     targets = [
+         d.replace(".pdb", "") for d in os.listdir(target_dir)
+     ]
+     # pred_bases = os.listdir(pred_dir)
+     output_dir = pred_dir
+     tag = tag if tag is not None else "dev"
+     timestamp = strftime("%m%d-%H-%M")
+
+     fns = {
+         'val_clash': metrics.validity,
+         'val_bond': metrics.bonding_validity,
+         'js_pwd': metrics.js_pwd,
+         'js_rg': metrics.js_rg,
+         # 'js_tica_pos': metrics.js_tica_pos,
+         'w2_rmwd': metrics.w2_rmwd,
+         # 'div_rmsd': metrics.div_rmsd,
+         'div_rmsf': metrics.div_rmsf,
+         'pro_w_contacts': metrics.pro_w_contacts,
+         'pro_t_contacts': metrics.pro_t_contacts,
+         # 'pro_c_contacts': metrics.pro_c_contacts,
+     }
+     eval_res = {k: {} for k in fns}
+
+     print(f"total_md_num = {len(targets)}")
+     count = 0
+     for target in targets:
+         count += 1
+         print("")
+         print(count, target)
+         pred_file = os.path.join(pred_dir, f"{target}.pdb")
+         # assert os.path.isfile(pred_file), f"pred_file {pred_file} does not exist."
+         if not os.path.isfile(pred_file):
+             continue
+
+         target_file = os.path.join(target_dir, f"{target}.pdb")
+         ca_coords = {
+             'target': extract_backbone_coords(target_file),
+             'pred': extract_backbone_coords(pred_file),
+         }
+         cry_target_file = os.path.join(crystal_dir, f"{target}.pdb")
+         cry_ca_coords = extract_backbone_coords(cry_target_file)[0]
+
+         for f_name, func in fns.items():
+             print(f_name)
+
+             if f_name == 'w2_rmwd':
+                 # Rigidly align every frame onto the first target frame before computing the metric.
+                 v_ref = torch.as_tensor(ca_coords['target'][0])
+                 for k, v in ca_coords.items():
+                     v = torch.as_tensor(v)  # (n_frames, n_residues, 3), e.g. (250, 356, 3)
+                     for idx in range(v.shape[0]):
+                         R, t = _find_rigid_alignment(v[idx], v_ref)
+                         v[idx] = (torch.matmul(R, v[idx].transpose(-2, -1))).transpose(-2, -1) + t.unsqueeze(0)
+                     ca_coords[k] = v.numpy()
+
+             if f_name.startswith('js_'):
+                 res = func(ca_coords, ref_key='target')
+             elif f_name == 'pro_c_contacts':
+                 res = func(target_file, pred_file, cry_target_file)
+             elif f_name.startswith('pro_'):
+                 res = func(ca_coords, cry_ca_coords)
+             else:
+                 res = func(ca_coords)
+
+             if f_name == 'js_tica' or f_name == 'js_tica_pos':
+                 pass
+                 # eval_res[f_name][target] = res[0]['pred']
+                 # save_to = os.path.join(output_dir, f"tica_{target}_{tag}_{timestamp}.png")
+                 # plot_utils.scatterplot_2d(res[1], save_to=save_to, ref_key='target')
+             else:
+                 eval_res[f_name][target] = res['pred']
+
+     csv_save_to = os.path.join(output_dir, f"metrics_{tag}_{timestamp}.csv")
+     df = pd.DataFrame.from_dict(eval_res)  # row = target, col = metric name
+     df.to_csv(csv_save_to)
+     print(f"metrics saved to {csv_save_to}")
+     mean_metrics = np.around(df.mean(), decimals=4)
+
+     return mean_metrics
+
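A minimal usage sketch for `evaluate_prediction`; the directory paths and tag below are hypothetical, and each directory is assumed to contain one `<target>.pdb` file per protein (predicted ensemble, reference MD ensemble, and crystal structure, respectively):

```python
# Hypothetical paths; each directory holds one <target>.pdb per protein.
mean_metrics = evaluate_prediction(
    pred_dir="outputs/predictions",    # generated ensembles
    target_dir="data/md_ensembles",    # reference MD ensembles
    crystal_dir="data/crystal",        # crystal structures used by the pro_* metrics
    tag="p2dflow_eval",
)
print(mean_metrics)  # per-metric means; the per-target table is written to a CSV in pred_dir
```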
+
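The `w2_rmwd` branch above rigidly aligns every sampled frame onto the first target frame before computing the metric. The repository's `_find_rigid_alignment` helper lives in `analysis/src/common/geo_utils.py` (not shown here); the snippet below is only a generic Kabsch-style sketch of what such a helper typically returns, a rotation `R` and translation `t` with the same calling convention as used above.

```python
import math
import torch

def kabsch_alignment(A: torch.Tensor, B: torch.Tensor):
    """Generic Kabsch-style rigid alignment: returns (R, t) such that R @ A[i] + t ~= B[i].

    Illustration only -- the repo's own `_find_rigid_alignment` may differ in details.
    """
    a_mean, b_mean = A.mean(dim=0), B.mean(dim=0)
    A_c, B_c = A - a_mean, B - b_mean
    H = A_c.T @ B_c                           # 3x3 cross-covariance
    U, S, Vt = torch.linalg.svd(H)
    d = torch.sign(torch.det(Vt.T @ U.T))     # guard against reflections
    D = torch.eye(3)
    D[2, 2] = d
    R = Vt.T @ D @ U.T
    t = b_mean - R @ a_mean
    return R, t

# Aligning a rotated-and-shifted copy back onto the original should recover it.
c, s = math.cos(0.3), math.sin(0.3)
Rz = torch.tensor([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
B = torch.randn(50, 3)
A = (Rz @ B.T).T + torch.tensor([1.0, 2.0, 3.0])
R, t = kabsch_alignment(A, B)
print(torch.allclose((R @ A.T).T + t, B, atol=1e-4))  # True
```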
+ # @task_wrapper
+ # def evaluate(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ #     """Sample on a test set and report evaluation metrics.
+
+ #     This method is wrapped in optional @task_wrapper decorator, that controls the behavior during
+ #     failure. Useful for multiruns, saving info about the crash, etc.
+
+ #     :param cfg: DictConfig configuration composed by Hydra.
+ #     :return: Tuple[dict, dict] with metrics and dict with all instantiated objects.
+ #     """
+ #     # assert cfg.ckpt_path
+ #     pred_dir = cfg.get("pred_dir")
+ #     if pred_dir and os.path.isdir(pred_dir):
+ #         log.info(f"Found pre-computed prediction directory {pred_dir}.")
+ #         metric_dict = evaluate_prediction(pred_dir, target_dir=cfg.target_dir)
+ #         return metric_dict, None
+
+ #     log.info(f"Instantiating datamodule <{cfg.data._target_}>")
+ #     datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data)
+
+ #     log.info(f"Instantiating model <{cfg.model._target_}>")
+ #     model: LightningModule = hydra.utils.instantiate(cfg.model)
+
+ #     log.info("Instantiating loggers...")
+ #     logger: List[Logger] = instantiate_loggers(cfg.get("logger"))
+
+ #     log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
+ #     trainer: Trainer = hydra.utils.instantiate(cfg.trainer, logger=logger)
+
+ #     object_dict = {
+ #         "cfg": cfg,
+ #         "datamodule": datamodule,
+ #         "model": model,
+ #         "logger": logger,
+ #         "trainer": trainer,
+ #     }
+
+ #     if logger:
+ #         log.info("Logging hyperparameters!")
+ #         log_hyperparameters(object_dict)
+
+ #     # Load checkpoint manually.
+ #     model, ckpt_path = checkpoint_utils.load_model_checkpoint(model, cfg.ckpt_path)
+
+ #     # log.info("Starting testing!")
+ #     # trainer.test(model=model, datamodule=datamodule, ckpt_path=cfg.ckpt_path)
+
+ #     # Get dataloader for prediction.
+ #     datamodule.setup(stage="predict")
+ #     dataloaders = datamodule.test_dataloader()
+
+ #     log.info("Starting predictions.")
+ #     pred_dir = trainer.predict(model=model, dataloaders=dataloaders, ckpt_path=ckpt_path)[-1]
+
+ #     # metric_dict = trainer.callback_metrics
+ #     log.info("Starting evaluations.")
+ #     metric_dict = evaluate_prediction(pred_dir, target_dir=cfg.target_dir)
+
+ #     return metric_dict, object_dict
+
+
+ # @hydra.main(version_base="1.3", config_path="../configs", config_name="eval.yaml")
+ # def main(cfg: DictConfig) -> None:
+ #     """Main entry point for evaluation.
+
+ #     :param cfg: DictConfig configuration composed by Hydra.
+ #     """
+ #     # apply extra utilities
+ #     # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.)
+ #     extras(cfg)
+
+ #     evaluate(cfg)
+
+
+ # if __name__ == "__main__":
+ #     main()