From 9647cda8c323d0245f09812c51b4577a742e1551 Mon Sep 17 00:00:00 2001 From: Tomas Gonzalez Zarzar Date: Fri, 30 Aug 2024 16:23:28 -0400 Subject: [PATCH] read excel LOD values, improve transformation, format; closes #8, closes #9 --- docs/index.md | 2 +- pyproject.toml | 3 +- src/metabo_adni/data/load.py | 52 +++++++++++++++++++-------- src/metabo_adni/qc/transformations.py | 23 ++++++------ 4 files changed, 52 insertions(+), 28 deletions(-) diff --git a/docs/index.md b/docs/index.md index ed6f130..fa566bb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -59,7 +59,7 @@ Four datasets contain the proper data, divided by method (FIA, UPLC) and cohort - `ADMCDUKEP180FIA_01_15_16.csv` obtained from `ADMC Duke Biocrates P180 Kit Flow injection analysis [ADNI1]` item - `ADMCDUKEP180FIAADNI2GO.csv` obtained from `ADMC Duke Biocrates p180 Kit Flow injection analysis [ADNIGO,2]` item - `ADMCDUKEP180UPLC_01_15_16.csv` obtained from `ADMC Duke Biocrates P180 Kit Ultra Performance Liquid Chromatography [ADNI1]` item -- `ADMCDUKEP180UPLCADNI2GO.csv` obtained from `ADMC Duke p180 Ultra Performance Liquid Chromatography [ADNIGO,2]` item +- `ADMCDUKEP180UPLCADNI2GO.csv` obtained from `ADMC Duke p180 Ultra Performance Liquid Chromatography [ADNIGO,2]` item. **Note**: make sure to add a single double-quote character (") at the end of this file; otherwise, pandas will not read the file correctly.
#### LOD diff --git a/pyproject.toml b/pyproject.toml index 5c76a5c..885b436 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ packages = [{ include = "metabo_adni", from = "src" }] python = "^3.12" pandas = "^2.2" pingouin = "0.5.4" +openpyxl = "^3.1" [tool.poetry.scripts] clean_files = "metabo_adni.__main__:main" @@ -29,4 +30,4 @@ build-backend = "poetry.core.masonry.api" [tool.pyright] include = ["src"] venv = "metabo_adni_env" -venvPath = "miniforge3/envs/" +venvPath = "home/tomas/miniforge3/envs/" diff --git a/src/metabo_adni/data/load.py b/src/metabo_adni/data/load.py index 2f7eb57..7f0a890 100644 --- a/src/metabo_adni/data/load.py +++ b/src/metabo_adni/data/load.py @@ -1,5 +1,6 @@ -import os import glob +import os + import numpy as np import pandas as pd @@ -50,9 +51,10 @@ def read_files(directory: str, platform: str) -> dict[str, pd.DataFrame]: for i, f in enumerate(file_names): if f in dir_files: key = list(platform_files.keys())[i] - dat = pd.DataFrame(pd.read_csv(f, na_values=na_values)).set_index( - index_cols - ) + dat = pd.read_csv( + f, + na_values=na_values, + ).set_index(index_cols) dat = dat.sort_index() if "ADNI2GO" in f: dat = _replace_bad_col_names(dat) @@ -114,26 +116,46 @@ def read_lod_files(directory: str) -> dict[str, pd.DataFrame]: "ADNI2GO-FIA": "", } filenames = [ - "P180UPLCLODvalues_ADNI1.csv", - "P180FIALODvalues_ADNI1.csv", - "P180UPLCLODvalues_ADNI2GO.csv", - "P180FIALODvalues_ADNI2GO.csv", + "4097_UPLC_p180_Data.xlsx", + "4097_FIA_p180_Data.xlsx", + "4610 UPLC p180 Data.xlsx", + "4610 FIA p180 Data.xlsx", ] for i, key in enumerate(lod_files): - dat = pd.DataFrame(pd.read_csv(filenames[i], encoding="latin_1")) + print(i, key) + if key == "ADNI2GO-UPLC": + dat = pd.read_excel( + io=filenames[i], + sheet_name=0, + header=0, + index_col=10, + skiprows=[0, 2], + nrows=1, + ).iloc[:, 10:] + else: + dat = pd.read_excel( + io=filenames[i], + sheet_name=0, + header=0, + index_col=10, + skiprows=[0, 2], + nrows=11, + 
).iloc[:, 10:] # Metabolite names in lod don't match those in the data # Replace '-', ':', '(', ')' and ' ' with '.' dat = _replace_bad_col_names(dat) if "UPLC" in key: # Change metabolite name from Met.So to Met.SO dat.rename(columns={"Met.SO": "Met.So"}, inplace=True) - elif key == "ADNI2GO-FIA": - # In lod value ADNI2GO-FIA, the bar code plate needs fixing - barcode = dat["Plate.Bar.Code"] - barcode = barcode.str.split(" ", expand=True)[2].str.replace( - pat="/", repl="-" - ) + # In LOD files, the bar code plate needs fixing, + # except ADNI2GO-UPLC where it's only one + if "ADNI2GO-UPLC" != key: + barcode = dat.index + barcode = [ + l.replace("/", "-") for l in [x[2] for x in barcode.str.split(" ")] + ] dat["Plate.Bar.Code"] = barcode + dat = dat.reset_index(drop=True) lod_files[key] = dat return lod_files diff --git a/src/metabo_adni/qc/transformations.py b/src/metabo_adni/qc/transformations.py index 2ff7617..89839ab 100644 --- a/src/metabo_adni/qc/transformations.py +++ b/src/metabo_adni/qc/transformations.py @@ -35,11 +35,13 @@ def imputation( print("=== Imputing metabolites ===") total_points_imputed = 0 total_mets_imputed = [] + if lod_directory is not None: + lod_files = load.read_lod_files(lod_directory) for key in dat_dict: metabo_names = load._get_metabo_col_names(dat_dict[key], key) indices = load._get_data_indices(dat_dict[key], platform) - dat = dat_dict[key].loc[indices, metabo_names] + dat = dat_dict[key].loc[indices, list(metabo_names) + ["Plate.Bar.Code"]] mets_to_impute = dat.columns[dat.isna().any()] data_points_impute = dat.isna().sum().sum() total_mets_imputed.extend(mets_to_impute) @@ -51,16 +53,15 @@ def imputation( for j in mets_to_impute: indices = dat.loc[dat[j].isna()].index if platform == "p180" and lod_directory is not None: - lod_files = load.read_lod_files(lod_directory) - dat = dat_dict[key].loc[ - dat_dict[key].index < 99999, list(metabo_names) + ["Plate.Bar.Code"] - ] - barcode = pd.Series(dat.loc[indices, "Plate.Bar.Code"]) - 
vals = [] - for bar in barcode: - met_lod = lod_files[key].loc[:, "Plate.Bar.Code"] == bar - vals.append(lod_files[key].loc[met_lod, j]) - dat_dict[key].loc[indices, j] = np.mean(vals) * 0.5 + if key == "ADNI2GO-UPLC": + dat_dict[key].loc[indices, j] = lod_files[key].loc[:, j] + else: + barcode = pd.Series(dat.loc[indices, "Plate.Bar.Code"]) + for bar in barcode: + met_lod = lod_files[key].loc[:, "Plate.Bar.Code"] == bar + dat_dict[key].loc[indices, j] = ( + lod_files[key].loc[met_lod, j] / 2 + ) else: half_min = dat.loc[:, j].min() / 2 dat_dict[key].loc[indices, j] = half_min