Skip to content

Commit

Permalink
read excel LOD values, improve transformation, format; closes #8, closes
Browse files Browse the repository at this point in the history
  • Loading branch information
tomszar committed Aug 30, 2024
1 parent 5385933 commit 9647cda
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 28 deletions.
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Four datasets contain the proper data, divided by method (FIA, UPLC) and cohort
- `ADMCDUKEP180FIA_01_15_16.csv` obtained from `ADMC Duke Biocrates P180 Kit Flow injection analysis [ADNI1]` item
- `ADMCDUKEP180FIAADNI2GO.csv` obtained from `ADMC Duke Biocrates p180 Kit Flow injection analysis [ADNIGO,2]` item
- `ADMCDUKEP180UPLC_01_15_16.csv` obtained from `ADMC Duke Biocrates P180 Kit Ultra Performance Liquid Chromatography [ADNI1]` item
- `ADMCDUKEP180UPLCADNI2GO.csv` obtained from `ADMC Duke p180 Ultra Performance Liquid Chromatography [ADNIGO,2]` item
- `ADMCDUKEP180UPLCADNI2GO.csv` obtained from `ADMC Duke p180 Ultra Performance Liquid Chromatography [ADNIGO,2]` item. **Note**: make sure to add a single double-quote character (`"`) at the end of this file; otherwise, pandas will not read the file correctly.

#### LOD

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ packages = [{ include = "metabo_adni", from = "src" }]
python = "^3.12"
pandas = "^2.2"
pingouin = "0.5.4"
openpyxl = "^3.1"

[tool.poetry.scripts]
clean_files = "metabo_adni.__main__:main"
Expand All @@ -29,4 +30,4 @@ build-backend = "poetry.core.masonry.api"
[tool.pyright]
include = ["src"]
venv = "metabo_adni_env"
venvPath = "miniforge3/envs/"
venvPath = "home/tomas/miniforge3/envs/"
52 changes: 37 additions & 15 deletions src/metabo_adni/data/load.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import glob
import os

import numpy as np
import pandas as pd

Expand Down Expand Up @@ -50,9 +51,10 @@ def read_files(directory: str, platform: str) -> dict[str, pd.DataFrame]:
for i, f in enumerate(file_names):
if f in dir_files:
key = list(platform_files.keys())[i]
dat = pd.DataFrame(pd.read_csv(f, na_values=na_values)).set_index(
index_cols
)
dat = pd.read_csv(
f,
na_values=na_values,
).set_index(index_cols)
dat = dat.sort_index()
if "ADNI2GO" in f:
dat = _replace_bad_col_names(dat)
Expand Down Expand Up @@ -114,26 +116,46 @@ def read_lod_files(directory: str) -> dict[str, pd.DataFrame]:
"ADNI2GO-FIA": "",
}
filenames = [
"P180UPLCLODvalues_ADNI1.csv",
"P180FIALODvalues_ADNI1.csv",
"P180UPLCLODvalues_ADNI2GO.csv",
"P180FIALODvalues_ADNI2GO.csv",
"4097_UPLC_p180_Data.xlsx",
"4097_FIA_p180_Data.xlsx",
"4610 UPLC p180 Data.xlsx",
"4610 FIA p180 Data.xlsx",
]
for i, key in enumerate(lod_files):
dat = pd.DataFrame(pd.read_csv(filenames[i], encoding="latin_1"))
print(i, key)
if key == "ADNI2GO-UPLC":
dat = pd.read_excel(
io=filenames[i],
sheet_name=0,
header=0,
index_col=10,
skiprows=[0, 2],
nrows=1,
).iloc[:, 10:]
else:
dat = pd.read_excel(
io=filenames[i],
sheet_name=0,
header=0,
index_col=10,
skiprows=[0, 2],
nrows=11,
).iloc[:, 10:]
# Metabolite names in lod don't match those in the data
# Replace '-', ':', '(', ')' and ' ' with '.'
dat = _replace_bad_col_names(dat)
if "UPLC" in key:
# Change metabolite name from Met.SO to Met.So
dat.rename(columns={"Met.SO": "Met.So"}, inplace=True)
elif key == "ADNI2GO-FIA":
# In lod value ADNI2GO-FIA, the bar code plate needs fixing
barcode = dat["Plate.Bar.Code"]
barcode = barcode.str.split(" ", expand=True)[2].str.replace(
pat="/", repl="-"
)
# In LOD files, the bar code plate needs fixing,
# except ADNI2GO-UPLC where it's only one
if "ADNI2GO-UPLC" != key:
barcode = dat.index
barcode = [
l.replace("/", "-") for l in [x[2] for x in barcode.str.split(" ")]
]
dat["Plate.Bar.Code"] = barcode
dat = dat.reset_index(drop=True)
lod_files[key] = dat
return lod_files

Expand Down
23 changes: 12 additions & 11 deletions src/metabo_adni/qc/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ def imputation(
print("=== Imputing metabolites ===")
total_points_imputed = 0
total_mets_imputed = []
if lod_directory is not None:
lod_files = load.read_lod_files(lod_directory)

for key in dat_dict:
metabo_names = load._get_metabo_col_names(dat_dict[key], key)
indices = load._get_data_indices(dat_dict[key], platform)
dat = dat_dict[key].loc[indices, metabo_names]
dat = dat_dict[key].loc[indices, list(metabo_names) + ["Plate.Bar.Code"]]
mets_to_impute = dat.columns[dat.isna().any()]
data_points_impute = dat.isna().sum().sum()
total_mets_imputed.extend(mets_to_impute)
Expand All @@ -51,16 +53,15 @@ def imputation(
for j in mets_to_impute:
indices = dat.loc[dat[j].isna()].index
if platform == "p180" and lod_directory is not None:
lod_files = load.read_lod_files(lod_directory)
dat = dat_dict[key].loc[
dat_dict[key].index < 99999, list(metabo_names) + ["Plate.Bar.Code"]
]
barcode = pd.Series(dat.loc[indices, "Plate.Bar.Code"])
vals = []
for bar in barcode:
met_lod = lod_files[key].loc[:, "Plate.Bar.Code"] == bar
vals.append(lod_files[key].loc[met_lod, j])
dat_dict[key].loc[indices, j] = np.mean(vals) * 0.5
if key == "ADNI2GO-UPLC":
dat_dict[key].loc[indices, j] = lod_files[key].loc[:, j]
else:
barcode = pd.Series(dat.loc[indices, "Plate.Bar.Code"])
for bar in barcode:
met_lod = lod_files[key].loc[:, "Plate.Bar.Code"] == bar
dat_dict[key].loc[indices, j] = (
lod_files[key].loc[met_lod, j] / 2
)
else:
half_min = dat.loc[:, j].min() / 2
dat_dict[key].loc[indices, j] = half_min
Expand Down

0 comments on commit 9647cda

Please sign in to comment.