read excel LOD values, improve transformation, format; closes #8, closes

#9
tomszar · Aug 30, 2024 · 9647cda · 9647cda
1 parent 5385933
commit 9647cda
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 28 deletions.
diff --git a/docs/index.md b/docs/index.md
@@ -59,7 +59,7 @@ Four datasets contain the proper data, divided by method (FIA, UPLC) and cohort
 - `ADMCDUKEP180FIA_01_15_16.csv` obtained from `ADMC Duke Biocrates P180 Kit Flow injection analysis [ADNI1]` item
 - `ADMCDUKEP180FIAADNI2GO.csv` obtained from `ADMC Duke Biocrates p180 Kit Flow injection analysis [ADNIGO,2]` item
 - `ADMCDUKEP180UPLC_01_15_16.csv` obtained from `ADMC Duke Biocrates P180 Kit Ultra Performance Liquid Chromatography [ADNI1]` item
-- `ADMCDUKEP180UPLCADNI2GO.csv` obtained from `ADMC Duke p180 Ultra Performance Liquid Chromatography [ADNIGO,2]` item
+- `ADMCDUKEP180UPLCADNI2GO.csv` obtained from `ADMC Duke p180 Ultra Performance Liquid Chromatography [ADNIGO,2]` item. **Note**: make sure to add a double quote (") at the end of this file; otherwise, pandas will not read the file correctly.
 
 #### LOD
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,6 +18,7 @@ packages = [{ include = "metabo_adni", from = "src" }]
 python = "^3.12"
 pandas = "^2.2"
 pingouin = "0.5.4"
+openpyxl = "^3.1"
 
 [tool.poetry.scripts]
 clean_files = "metabo_adni.__main__:main"
@@ -29,4 +30,4 @@ build-backend = "poetry.core.masonry.api"
 [tool.pyright]
 include = ["src"]
 venv = "metabo_adni_env"
-venvPath = "miniforge3/envs/"
+venvPath = "home/tomas/miniforge3/envs/"
diff --git a/src/metabo_adni/data/load.py b/src/metabo_adni/data/load.py
@@ -1,5 +1,6 @@
-import os
 import glob
+import os
+
 import numpy as np
 import pandas as pd
 
@@ -50,9 +51,10 @@ def read_files(directory: str, platform: str) -> dict[str, pd.DataFrame]:
     for i, f in enumerate(file_names):
         if f in dir_files:
             key = list(platform_files.keys())[i]
-            dat = pd.DataFrame(pd.read_csv(f, na_values=na_values)).set_index(
-                index_cols
-            )
+            dat = pd.read_csv(
+                f,
+                na_values=na_values,
+            ).set_index(index_cols)
             dat = dat.sort_index()
             if "ADNI2GO" in f:
                 dat = _replace_bad_col_names(dat)
@@ -114,26 +116,46 @@ def read_lod_files(directory: str) -> dict[str, pd.DataFrame]:
         "ADNI2GO-FIA": "",
     }
     filenames = [
-        "P180UPLCLODvalues_ADNI1.csv",
-        "P180FIALODvalues_ADNI1.csv",
-        "P180UPLCLODvalues_ADNI2GO.csv",
-        "P180FIALODvalues_ADNI2GO.csv",
+        "4097_UPLC_p180_Data.xlsx",
+        "4097_FIA_p180_Data.xlsx",
+        "4610 UPLC p180 Data.xlsx",
+        "4610 FIA p180 Data.xlsx",
     ]
     for i, key in enumerate(lod_files):
-        dat = pd.DataFrame(pd.read_csv(filenames[i], encoding="latin_1"))
+        print(i, key)
+        if key == "ADNI2GO-UPLC":
+            dat = pd.read_excel(
+                io=filenames[i],
+                sheet_name=0,
+                header=0,
+                index_col=10,
+                skiprows=[0, 2],
+                nrows=1,
+            ).iloc[:, 10:]
+        else:
+            dat = pd.read_excel(
+                io=filenames[i],
+                sheet_name=0,
+                header=0,
+                index_col=10,
+                skiprows=[0, 2],
+                nrows=11,
+            ).iloc[:, 10:]
         # Metabolite names in lod don't match those in the data
         # Replace '-', ':', '(', ')' and ' ' with '.'
         dat = _replace_bad_col_names(dat)
         if "UPLC" in key:
             # Change metabolite name from Met.SO to Met.So
             dat.rename(columns={"Met.SO": "Met.So"}, inplace=True)
-        elif key == "ADNI2GO-FIA":
-            # In lod value ADNI2GO-FIA, the bar code plate needs fixing
-            barcode = dat["Plate.Bar.Code"]
-            barcode = barcode.str.split(" ", expand=True)[2].str.replace(
-                pat="/", repl="-"
-            )
+        # In LOD files, the bar code plate needs fixing,
+        # except ADNI2GO-UPLC where it's only one
+        if "ADNI2GO-UPLC" != key:
+            barcode = dat.index
+            barcode = [
+                l.replace("/", "-") for l in [x[2] for x in barcode.str.split(" ")]
+            ]
             dat["Plate.Bar.Code"] = barcode
+            dat = dat.reset_index(drop=True)
         lod_files[key] = dat
     return lod_files
 

diff --git a/src/metabo_adni/qc/transformations.py b/src/metabo_adni/qc/transformations.py
@@ -35,11 +35,13 @@ def imputation(
     print("=== Imputing metabolites ===")
     total_points_imputed = 0
     total_mets_imputed = []
+    if lod_directory is not None:
+        lod_files = load.read_lod_files(lod_directory)
 
     for key in dat_dict:
         metabo_names = load._get_metabo_col_names(dat_dict[key], key)
         indices = load._get_data_indices(dat_dict[key], platform)
-        dat = dat_dict[key].loc[indices, metabo_names]
+        dat = dat_dict[key].loc[indices, list(metabo_names) + ["Plate.Bar.Code"]]
         mets_to_impute = dat.columns[dat.isna().any()]
         data_points_impute = dat.isna().sum().sum()
         total_mets_imputed.extend(mets_to_impute)
@@ -51,16 +53,15 @@ def imputation(
         for j in mets_to_impute:
             indices = dat.loc[dat[j].isna()].index
             if platform == "p180" and lod_directory is not None:
-                lod_files = load.read_lod_files(lod_directory)
-                dat = dat_dict[key].loc[
-                    dat_dict[key].index < 99999, list(metabo_names) + ["Plate.Bar.Code"]
-                ]
-                barcode = pd.Series(dat.loc[indices, "Plate.Bar.Code"])
-                vals = []
-                for bar in barcode:
-                    met_lod = lod_files[key].loc[:, "Plate.Bar.Code"] == bar
-                    vals.append(lod_files[key].loc[met_lod, j])
-                dat_dict[key].loc[indices, j] = np.mean(vals) * 0.5
+                if key == "ADNI2GO-UPLC":
+                    dat_dict[key].loc[indices, j] = lod_files[key].loc[:, j]
+                else:
+                    barcode = pd.Series(dat.loc[indices, "Plate.Bar.Code"])
+                    for bar in barcode:
+                        met_lod = lod_files[key].loc[:, "Plate.Bar.Code"] == bar
+                        dat_dict[key].loc[indices, j] = (
+                            lod_files[key].loc[met_lod, j] / 2
+                        )
             else:
                 half_min = dat.loc[:, j].min() / 2
                 dat_dict[key].loc[indices, j] = half_min