-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdemo_run.py
168 lines (135 loc) · 5.5 KB
/
demo_run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""
Enzyme Kinetics Parameter Prediction Script

This script predicts enzyme kinetics parameters (kcat, Km, or Ki) using a pre-trained model.
It processes input data, generates predictions, and saves the results.

Usage:
    python demo_run.py --parameter <kcat|km|ki> --input_file <path_to_input_csv> [--use_gpu]

Dependencies:
    pandas, numpy, rdkit, IPython, argparse
"""
import time
import os
import pandas as pd
import numpy as np
from IPython.display import Image, display
from rdkit import Chem
from IPython.display import display, Latex, Math
import argparse
def create_csv_sh(parameter, input_file_path):
    """
    Process input data and create a shell script for prediction.

    Reads the input CSV, canonicalizes every SMILES string with RDKit,
    validates the enzyme sequences, writes the cleaned table next to the
    input file as ``<stem>_input.csv`` and writes ``predict.sh`` into the
    current working directory.

    Args:
        parameter (str): The kinetics parameter to predict. Selects the
            checkpoint directory ``./production_models/<parameter>/``.
        input_file_path (str): Path to the input CSV file. It must contain
            ``SMILES`` and ``sequence`` columns.

    Returns:
        str or None: Path the generated script will write its predictions
        to (``<stem>_input_output.csv``), or None when a SMILES string or
        an enzyme sequence is invalid (a message is printed in that case).
    """
    import shlex  # local import: only needed to quote the path in predict.sh

    df = pd.read_csv(input_file_path)
    smiles_list_new = []

    # Process SMILES strings
    for i, smi in enumerate(df.SMILES):
        try:
            mol = Chem.MolFromSmiles(smi)
            # RDKit reports an unparsable SMILES by returning None; check it
            # explicitly instead of relying on MolToSmiles(None) raising.
            canonical = Chem.MolToSmiles(mol) if mol is not None else None
        except Exception:
            # Non-string cells (e.g. NaN from an empty CSV field) raise here.
            canonical = None
        if canonical is None:
            print(f'Invalid SMILES input in input row {i}')
            print('Correct your input! Exiting..')
            return None
        if parameter == 'kcat' and '.' in canonical:
            # Multi-component input: put the fragments in a fixed order.
            canonical = '.'.join(sorted(canonical.split('.')))
        smiles_list_new.append(canonical)

    # Validate enzyme sequences
    valid_aas = set('ACDEFGHIKLMNPQRSTVWY')
    for i, seq in enumerate(df.sequence):
        # Empty cells are read as NaN (a float); set(NaN) would raise
        # TypeError, so reject anything that is not a non-empty string.
        if not isinstance(seq, str) or not seq or not set(seq).issubset(valid_aas):
            print(f'Invalid Enzyme sequence input in row {i}!')
            print('Correct your input! Exiting..')
            return None

    # Save processed input. splitext (rather than slicing off four
    # characters) keeps the stem intact for any extension length.
    stem, _ = os.path.splitext(input_file_path)
    prefix = f'{stem}_input'
    input_file_new_path = f'{prefix}.csv'
    df['SMILES'] = smiles_list_new
    df.to_csv(input_file_new_path)

    # Create shell script for prediction. The prefix is shell-quoted and
    # every expansion is double-quoted so paths containing spaces or shell
    # metacharacters survive word splitting.
    with open('predict.sh', 'w') as f:
        f.write(f'''
TEST_FILE_PREFIX={shlex.quote(prefix)}
RECORDS_FILE="${{TEST_FILE_PREFIX}}.json"
CHECKPOINT_DIR=./production_models/{parameter}/
python ./scripts/create_pdbrecords.py --data_file "${{TEST_FILE_PREFIX}}.csv" --out_file "${{RECORDS_FILE}}"
python predict.py --test_path "${{TEST_FILE_PREFIX}}.csv" --preds_path "${{TEST_FILE_PREFIX}}_output.csv" --checkpoint_dir "$CHECKPOINT_DIR" --uncertainty_method mve --smiles_column SMILES --individual_ensemble_predictions --protein_records_path "$RECORDS_FILE"
''')

    return f'{prefix}_output.csv'
def get_predictions(parameter, outfile):
    """
    Process prediction results and add additional metrics.

    Args:
        parameter (str): The kinetics parameter that was predicted.
            ``'kcat'`` reads the ``log10kcat_max`` column, ``'km'`` reads
            ``log10km_mean`` and any other value reads ``log10ki_mean``.
        outfile (str): Path to the output CSV file from the prediction. It
            must contain the target column, ``<target>_mve_uncal_var`` and
            the per-model columns (names starting with the target name and
            containing ``model_``).

    Returns:
        pandas.DataFrame: The prediction table with these columns appended:
            ``Prediction_(<unit>)``: 10 ** prediction;
            ``Prediction_log10``: the prediction as read from the file;
            ``SD_total``: square root of the ``_mve_uncal_var`` column;
            ``SD_aleatoric``: square root of (total variance - epistemic
            variance), with the difference clamped at zero;
            ``SD_epistemic``: standard deviation across the per-model
            columns.
    """
    df = pd.read_csv(outfile)

    unit = 'mM'
    if parameter == 'kcat':
        target_col = 'log10kcat_max'
        unit = 's^(-1)'
    elif parameter == 'km':
        target_col = 'log10km_mean'
    else:
        target_col = 'log10ki_mean'
    unc_col = f'{target_col}_mve_uncal_var'

    # The per-model columns are the same for every row, so resolve them once
    # and compute all metrics column-wise instead of row by row.
    model_cols = [col for col in df.columns
                  if col.startswith(target_col) and 'model_' in col]

    prediction = df[target_col].to_numpy(dtype=float)
    total_var = df[unc_col].to_numpy(dtype=float)
    model_outs = df[model_cols].to_numpy(dtype=float)

    # Variance of the individual model outputs within each row.
    epi_var = np.var(model_outs, axis=1)
    # Rounding (or inconsistent inputs) can push the difference below zero,
    # which would turn the square root into NaN; clamp it at zero.
    alea_var = np.maximum(total_var - epi_var, 0.0)

    df[f'Prediction_({unit})'] = np.power(10.0, prediction)
    df['Prediction_log10'] = prediction
    df['SD_total'] = np.sqrt(total_var)
    df['SD_aleatoric'] = np.sqrt(alea_var)
    df['SD_epistemic'] = np.sqrt(epi_var)

    return df
def main(args):
    """
    Main function to run the prediction process.

    Prepares the input and ``predict.sh``, runs the script, post-processes
    the predictions and writes them to ``../results/`` (relative to the
    current working directory).

    Args:
        args (argparse.Namespace): Command-line arguments. Reads
            ``parameter``, ``input_file`` and ``use_gpu``.

    Raises:
        RuntimeError: If ``predict.sh`` exits with a non-zero status.
    """
    import subprocess  # local import: only needed to launch predict.sh

    print(os.getcwd())

    outfile = create_csv_sh(args.parameter, args.input_file)
    if outfile is None:
        # Input validation failed; create_csv_sh already printed the reason.
        return

    print('Predicting.. This will take a while..\n')

    # PROTEIN_EMBED_USE_CPU is passed through the environment: 0 when the
    # GPU was requested, 1 otherwise.
    env = dict(os.environ, PROTEIN_EMBED_USE_CPU='0' if args.use_gpu else '1')
    # Run the script through sh: the freshly written file has no shebang
    # and may not carry an executable bit.
    completed = subprocess.run(['sh', './predict.sh'], env=env)
    if completed.returncode != 0:
        # Without this check a stale output file from an earlier run could
        # be picked up and reported as the new result.
        raise RuntimeError(
            f'predict.sh failed with exit code {completed.returncode}')

    output_final = get_predictions(args.parameter, outfile)

    filename = os.path.basename(outfile)
    results_dir = os.path.join('..', 'results')
    os.makedirs(results_dir, exist_ok=True)
    results_path = os.path.join(results_dir, filename)
    output_final.to_csv(results_path)
    # Report the path that was actually written.
    print(f'Output saved to {results_path}')
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the pipeline.
    parser = argparse.ArgumentParser(description="Predict enzyme kinetics parameters.")
    # type=str.lower normalizes the value before argparse checks it against
    # `choices`, so "KCAT" or "Km" are accepted. Lower-casing after
    # parse_args() would be too late: the value would already be rejected.
    parser.add_argument("--parameter", type=str.lower, choices=["kcat", "km", "ki"], required=True,
                        help="Kinetics parameter to predict (kcat, km, or ki)")
    parser.add_argument("--input_file", type=str, required=True,
                        help="Path to the input CSV file")
    parser.add_argument("--use_gpu", action="store_true",
                        help="Use GPU for prediction (default is CPU)")

    args = parser.parse_args()

    main(args)