Spaces:
Runtime error
Runtime error
| import itertools as it | |
| import os | |
| import joblib | |
| import numpy as np | |
| import pandas as pd | |
| import pkg_resources | |
| import streamlit as st | |
| from b3clf.descriptor_padel import compute_descriptors | |
| from b3clf.geometry_opt import geometry_optimize | |
| from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors | |
def load_all_models():
    """Load every pre-trained b3clf classifier shipped inside the package.

    One joblib file is read for each (classifier, sampling strategy) pair
    from the ``pre_trained`` resource directory of the ``b3clf`` package.

    Returns
    -------
    dict
        Maps ``"<classifier>_<sampling>"`` (for example
        ``"xgb_classic_ADASYN"``) to the object returned by ``joblib.load``.
    """
    classifiers = ("dtree", "knn", "logreg", "xgb")
    sampling_strategies = (
        "borderline_SMOTE",
        "classic_ADASYN",
        "classic_RandUndersampling",
        "classic_SMOTE",
        "kmeans_SMOTE",
        "common",
    )

    loaded = {}
    # Classifier-major order: every sampling strategy for one classifier is
    # loaded before moving on to the next classifier.
    for clf_name in classifiers:
        for sampling_name in sampling_strategies:
            resource = f"pre_trained/b3clf_{clf_name}_{sampling_name}.joblib"
            with pkg_resources.resource_stream("b3clf", resource) as stream:
                loaded[f"{clf_name}_{sampling_name}"] = joblib.load(stream)
    return loaded
def predict_permeability(
    clf_str, sampling_str, _models_dict, mol_features, info_df, threshold="none"
):
    """Predict permeability probabilities and labels for featurized molecules.

    Parameters
    ----------
    clf_str : str
        Classifier name. Joined with ``sampling_str`` as
        ``"<clf_str>_<sampling_str>"`` to pick the model from ``_models_dict``.
    sampling_str : str
        Sampling strategy name (second half of the model key).
    _models_dict : dict
        Mapping from model key to a fitted estimator exposing
        ``predict_proba``.
    mol_features : pandas.DataFrame or array-like
        Feature matrix with one row per molecule. When it is a DataFrame its
        index must equal the index of ``info_df``.
    info_df : pandas.DataFrame
        Per-molecule information. It is NOT modified; the predictions are
        written to a copy so the same frame can be passed in again later.
    threshold : str, optional
        Column label looked up in the packaged ``data/B3clf_thresholds.xlsx``
        table to obtain the probability cut-off. Defaults to ``"none"``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``info_df`` with ``B3clf_predicted_probability`` and
        ``B3clf_predicted_label`` columns added and the index moved into a
        regular column by ``reset_index``.

    Raises
    ------
    ValueError
        If ``mol_features`` is a DataFrame whose index differs from
        ``info_df``'s index.
    """
    pred_model = _models_dict[clf_str + "_" + sampling_str]

    # Validate the inputs before reading any packaged data so a mismatch is
    # reported immediately.
    if isinstance(mol_features, pd.DataFrame):
        if mol_features.index.tolist() != info_df.index.tolist():
            raise ValueError("Features_df and Info_df do not have the same index.")

    # Load the threshold table shipped with the package.
    with pkg_resources.resource_stream("b3clf", "data/B3clf_thresholds.xlsx") as f:
        df_thres = pd.read_excel(f, index_col=0, engine="openpyxl")

    # NOTE(review): the cut-off is always read from the "xgb-classic_ADASYN"
    # row, regardless of clf_str/sampling_str. A per-model lookup
    # (clf_str + "-" + sampling_str) was previously disabled here; confirm
    # which rows the table actually contains before re-enabling it.
    # The original comment states that the default threshold is 0.5.
    cutoff = df_thres.loc["xgb-classic_ADASYN", threshold]

    # Work on a copy: writing columns and resetting the index on the caller's
    # frame would make a second call with the same frame fail the index check.
    result_df = info_df.copy()

    # Probability of the positive class (second column of predict_proba).
    result_df["B3clf_predicted_probability"] = pred_model.predict_proba(
        mol_features
    )[:, 1]

    # Label is 1 where the probability reaches the cut-off, otherwise 0.
    label_pool = np.zeros(mol_features.shape[0], dtype=int)
    mask = np.greater_equal(
        result_df["B3clf_predicted_probability"].to_numpy(),
        cutoff,
    )
    label_pool[mask] = 1
    result_df["B3clf_predicted_label"] = label_pool

    return result_df.reset_index()
def generate_predictions(
    input_fname: str = None,
    sep: str = "\\s+|\t+",
    clf: str = "xgb",
    _models_dict: dict = None,
    keep_sdf: str = "no",
    sampling: str = "classic_ADASYN",
    time_per_mol: int = 120,
    mol_features: pd.DataFrame = None,
    info_df: pd.DataFrame = None,
):
    """Generate permeability predictions from an input file or cached features.

    When ``mol_features`` and ``info_df`` are both omitted, the full pipeline
    runs on ``input_fname``: geometry optimization, descriptor calculation,
    descriptor selection and scaling. When both are supplied, that work is
    skipped and only the prediction step runs.

    Parameters
    ----------
    input_fname : str, optional
        Path to the input: either an SDF file with molecular geometries or a
        text file with SMILES strings. Required when no features are given.
    sep : str, optional
        Separator pattern forwarded to ``geometry_optimize``.
    clf : str, optional
        Classifier name used to select the model.
    _models_dict : dict, optional
        Mapping ``"<clf>_<sampling>"`` -> fitted model. When ``None`` the
        models are loaded with ``load_all_models()``.
    keep_sdf : str, optional
        ``"no"`` deletes the intermediate ``<name>_optimized_3d.sdf`` file
        once the pipeline finishes or fails; any other value keeps it.
    sampling : str, optional
        Sampling strategy name used to select the model.
    time_per_mol : int, optional
        Forwarded to ``compute_descriptors`` as ``time_per_molecule``.
    mol_features : pandas.DataFrame, optional
        Previously computed, selected and scaled descriptors.
    info_df : pandas.DataFrame, optional
        Molecule information matching ``mol_features``.

    Returns
    -------
    tuple
        ``(mol_features, info_df, result_df)`` where ``result_df`` holds only
        the ``ID``, ``SMILES``, ``B3clf_predicted_probability`` and
        ``B3clf_predicted_label`` columns that are present in the prediction
        output.

    Raises
    ------
    ValueError
        If only one of ``mol_features``/``info_df`` is given, or if neither
        is given and ``input_fname`` is missing.
    """
    # The two cached inputs only make sense as a pair.
    if (mol_features is None) != (info_df is None):
        raise ValueError("mol_features and info_df must be provided together.")

    if mol_features is None:
        if input_fname is None:
            raise ValueError(
                "input_fname is required when mol_features and info_df "
                "are not provided."
            )

        mol_tag = os.path.basename(input_fname).split(".")[0]
        # NOTE: the intermediate file name is derived from the input name
        # only; the original author flagged this as problematic when the same
        # file is used for more than one calculation.
        internal_sdf = f"{mol_tag}_optimized_3d.sdf"

        try:
            # Geometry optimization writes the 3D structures to internal_sdf.
            geometry_optimize(
                input_fname=input_fname, output_sdf=internal_sdf, sep=sep
            )

            df_features = compute_descriptors(
                sdf_file=internal_sdf,
                excel_out=None,
                output_csv=None,
                timeout=None,
                time_per_molecule=time_per_mol,
            )

            # Split descriptors from molecule information, then select and
            # scale the descriptors.
            mol_features, info_df = get_descriptors(df=df_features)
            mol_features = select_descriptors(df=mol_features)
            mol_features.iloc[:, :] = scale_descriptors(df=mol_features)
        finally:
            # Remove the intermediate file even when a step above raised, so
            # failed runs do not leave files behind.
            if keep_sdf == "no" and os.path.exists(internal_sdf):
                os.remove(internal_sdf)

    # Without an explicit model mapping, fall back to loading all models
    # instead of failing on a None lookup.
    if _models_dict is None:
        _models_dict = load_all_models()

    result_df = predict_permeability(
        clf_str=clf,
        sampling_str=sampling,
        _models_dict=_models_dict,
        mol_features=mol_features,
        info_df=info_df,
        threshold="none",
    )

    # Keep only the columns meant for display, in their existing order.
    display_cols = [
        "ID",
        "SMILES",
        "B3clf_predicted_probability",
        "B3clf_predicted_label",
    ]
    result_df = result_df[
        [col for col in result_df.columns.to_list() if col in display_cols]
    ]

    return mol_features, info_df, result_df