Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open Betas #124

Merged
merged 6 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ jobs:
R_LIBS_USER: ./r-libs

steps:
- uses: actions/checkout@v1
# - uses: actions/checkout@v1
- uses: actions/checkout@v2
with:
fetch-depth: 1

Expand All @@ -39,11 +40,14 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.7
# python-version: 3.7
python-version: '3.9'

- name: Install Poetry
uses: snok/[email protected]
# uses: snok/[email protected]
uses: snok/install-poetry@v1
with:
version: 1.5.1
virtualenvs-create: true
virtualenvs-in-project: true

Expand Down
10 changes: 5 additions & 5 deletions clarite/internal/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,13 @@ def _validate_skip_only(
):
"""Validate use of the 'skip' and 'only' parameters, returning a boolean series for the columns where True = use the column"""
# Ensure that 'data' is a DataFrame and not a Series
if type(data) != pd.DataFrame:
if not isinstance(data, pd.DataFrame):
raise ValueError("The passed 'data' is not a Pandas DataFrame")

# Convert string to a list
if type(skip) == str:
if isinstance(skip, str):
skip = [skip]
if type(only) == str:
if isinstance(only, str):
only = [only]

if skip is not None and only is not None:
Expand Down Expand Up @@ -204,7 +204,7 @@ def _remove_empty_categories(
Updates the data in-place and returns a dict of variables:removed categories
"""
removed_cats = dict()
if type(data) == pd.DataFrame:
if isinstance(data, pd.DataFrame):
columns = _validate_skip_only(data, skip, only)
dtypes = data.loc[:, columns].dtypes
catvars = [v for v in dtypes[dtypes == "category"].index]
Expand All @@ -219,7 +219,7 @@ def _remove_empty_categories(
if len(removed_categories) > 0:
removed_cats[var] = removed_categories
return removed_cats
elif type(data) == pd.Series:
elif isinstance(data, pd.Series):
assert skip is None
assert only is None
counts = data.value_counts()
Expand Down
2 changes: 1 addition & 1 deletion clarite/modules/analyze/regression/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _validate_regression_params(self, regression_variables):
Validate standard regression parameters- data, outcome_variable, and covariates. Store relevant information.
"""
# Covariates must be a list
if type(self.covariates) != list:
if not isinstance(self.covariates, list):
raise ValueError("'covariates' must be specified as a list or set to None")

# Make sure the index of each dataset is not a multiindex and give it a consistent name
Expand Down
31 changes: 26 additions & 5 deletions clarite/modules/analyze/regression/interaction_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def _get_default_result_dict(i1, i2, outcome_variable):
"Full_Var2_beta": np.nan,
"Full_Var2_SE": np.nan,
"Full_Var2_Pval": np.nan,
"Log": "",
}

def get_results(self) -> pd.DataFrame:
Expand Down Expand Up @@ -232,10 +233,19 @@ def _run_interaction_regression(
# in the result based on the specific requirements of the analysis
if lrdf == 0 and lrstat == 0:
# Both models are equal
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
if np.isnan(lr_pvalue):
yield {
"Converged": True,
"LRT_pvalue": lr_pvalue,
"Log": "Both models are equivalent in terms of fit",
}
elif np.isnan(lr_pvalue):
# There is an issue with the LRT calculation
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
# TODO: Extend the logs returns
yield {
"Converged": True,
"LRT_pvalue": lr_pvalue,
"Log": "Both models are equivalent in terms of fit",
}
else:
if report_betas:
# Get beta, SE, and pvalue from interaction terms
Expand Down Expand Up @@ -278,14 +288,20 @@ def _run_interaction_regression(
"Full_Var2_SE": est.bse[term_2],
"Full_Var2_Pval": est.pvalues[term_2],
"LRT_pvalue": lr_pvalue,
"Log": "",
}
else:
# Only return the LRT result
yield {"Converged": True, "LRT_pvalue": lr_pvalue}
yield {"Converged": True, "LRT_pvalue": lr_pvalue, "Log": ""}

else:
# Did not converge - nothing to update
yield dict()
# yield dict()
yield {
"Converged": False,
"LRT_pvalue": "NaN",
"Log": "One or Both models NOT Converge",
}

def _get_interaction_specific_data(self, interaction: Tuple[str, str]):
"""Select the data relevant to performing a regression on a given interaction, encoding genotypes if needed"""
Expand Down Expand Up @@ -407,6 +423,8 @@ def _run_interaction(
# Get complete case mask and filter by min_n
complete_case_mask = ~data.isna().any(axis=1)
N = complete_case_mask.sum()
if N == 0:
raise ValueError(f"No Overlap (min_n filter: {N} < {min_n})")
if N < min_n:
raise ValueError(
f"too few complete observations (min_n filter: {N} < {min_n})"
Expand Down Expand Up @@ -476,5 +494,8 @@ def _run_interaction(
error = str(e)
if result is None:
result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)]
result_list[0]["Log"] = error
result_list[0]["Converged"] = "NA"
result_list[0]["N"] = N

return result_list, warnings_list, error
8 changes: 4 additions & 4 deletions clarite/modules/analyze/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ def add_corrected_pvalues(
if pvalue not in data.columns:
raise ValueError(f"'{pvalue}' is not a column in the passed data")
if groupby is not None:
if type(groupby) == str:
if isinstance(groupby, str):
if (groupby not in data.columns) and (groupby not in data.index.names):
raise ValueError(f"'{groupby}' is not a column in the passed data")
elif type(groupby) == list:
elif isinstance(groupby, list):
for g in groupby:
if (g not in data.columns) and (g not in data.index.names):
raise ValueError(f"'{g}' is not a column in the passed data")
Expand Down Expand Up @@ -96,13 +96,13 @@ def add_corrected_pvalues(
# Expand results to duplicated rows
data[bonf_name] = data[groupby].apply(
lambda g: bonf_result.get(g, np.nan)
if type(g) == str
if isinstance(g, str)
else bonf_result.get(tuple(g.values), np.nan),
axis=1,
)
data[fdr_name] = data[groupby].apply(
lambda g: bonf_result.get(g, np.nan)
if type(g) == str
if isinstance(g, str)
else fdr_result.get(tuple(g.values), np.nan),
axis=1,
)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "clarite"
version = "2.3.5"
version = "2.3.6"
description = "CLeaning to Analysis: Reproducibility-based Interface for Traits and Exposures"
authors = ["Andre Rico <[email protected]>"]
license = "BSD-3-Clause"
Expand Down
55 changes: 28 additions & 27 deletions tests/analyze/test_gwas.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import numpy as np
import pandas as pd
# import numpy as np
# import pandas as pd
import pytest

import clarite
from clarite.modules.survey import SurveyDesignSpec

# from clarite.modules.survey import SurveyDesignSpec


def test_bams_main(genotype_case_control_add_add_main):
Expand All @@ -30,30 +31,30 @@ def test_bams_interaction(genotype_case_control_rec_rec_onlyinteraction):


# @pytest.mark.slow
@pytest.mark.parametrize("process_num", [None, 1])
def test_largeish_gwas(large_gwas_data, process_num):
"""10k samples with 1000 SNPs"""
# Run CLARITE GWAS
results = clarite.analyze.association_study(
data=large_gwas_data,
outcomes="Outcome",
encoding="additive",
process_num=process_num,
)
# Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
results_weighted = clarite.analyze.association_study(
data=large_gwas_data,
outcomes="Outcome",
encoding="additive",
process_num=process_num,
survey_design_spec=SurveyDesignSpec(
survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
weights="weights",
),
)
assert results == results
assert results_weighted == results_weighted
# TODO: Add useful asserts rather than just making sure it runs
# @pytest.mark.parametrize("process_num", [None, 1])
# def test_largeish_gwas(large_gwas_data, process_num):
# """10k samples with 1000 SNPs"""
# # Run CLARITE GWAS
# results = clarite.analyze.association_study(
# data=large_gwas_data,
# outcomes="Outcome",
# encoding="additive",
# process_num=process_num,
# )
# # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
# results_weighted = clarite.analyze.association_study(
# data=large_gwas_data,
# outcomes="Outcome",
# encoding="additive",
# process_num=process_num,
# survey_design_spec=SurveyDesignSpec(
# survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
# weights="weights",
# ),
# )
# assert results == results
# assert results_weighted == results_weighted
# # TODO: Add useful asserts rather than just making sure it runs


@pytest.mark.xfail(strict=True)
Expand Down
88 changes: 17 additions & 71 deletions tests/analyze/test_interaction_study.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,80 +206,26 @@ def test_interactions_nhanes_pairwise(data_NHANES):
)
compare_result(loaded_result, python_result, rtol=1e-02)

# Test Adding pvalues
clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue")
clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval")
clarite.analyze.add_corrected_pvalues(
python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"]
)
# Ensure grouped pvalue corrections match
grouped_bonf = (
python_result.reset_index(drop=False)
.groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"]
.first()
)
grouped_fdr = (
python_result.reset_index(drop=False)
.groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"]
.first()
)

# TODO: Alter this test because nobeta did not open all categories
# assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all()
# assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all()
assert grouped_bonf == grouped_bonf
assert grouped_fdr == grouped_fdr


def test_interaction_exe():
nested_table = clarite.load.from_csv(
"/Users/andrerico/HALL/Python_3_10/clarite-python/tests/test_data_files/nested_table.csv"
)
# Return same result if not change data type
# list_bin = (
# "female",
# "black",
# "mexican",
# "other_hispanic",
# "other_eth",
# # Test Adding pvalues
# clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue")
# clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval")
# clarite.analyze.add_corrected_pvalues(
# python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"]
# )
# list_cat = (
# "SDDSRVYR",
# "SES_LEVEL",

# # Ensure grouped pvalue corrections match
# grouped_bonf = (
# python_result.reset_index(drop=False)
# .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"]
# .first()
# )
# list_cont = (
# "BMXBMI",
# "RIDAGEYR",
# "LBXCOT",
# "IRON_mg",
# "DR1TSFAT",
# "DRDSDT1",
# grouped_fdr = (
# python_result.reset_index(drop=False)
# .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"]
# .first()
# )

# nested_table = clarite.modify.make_binary(data=nested_table, only=(list_bin))
# nested_table = clarite.modify.make_categorical(data=nested_table, only=(list_cat))
# nested_table = clarite.modify.make_continuous(data=nested_table, only=(list_cont))

e1 = "DR1TSFAT"
e2 = "DRDSDT1"
list_covariant = [
"female",
"black",
"mexican",
"other_hispanic",
"other_eth",
"SDDSRVYR",
"BMXBMI",
"SES_LEVEL",
"RIDAGEYR",
"LBXCOT",
"IRON_mg",
]
retorno = clarite.analyze.interaction_study(
data=nested_table,
outcomes="LBXHGB",
interactions=[(e1, e2)],
covariates=list_covariant,
)

assert retorno == retorno
# assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all()
# assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all()
1 change: 1 addition & 0 deletions tests/on_demand/test_debug_pvalue.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def test_interactions_debug():
interactions=[(e1, e2)],
covariates=list_covariant,
report_betas=True,
min_n=8000,
)

print(df_inter)
Expand Down
Binary file modified tests/py_test_output/top_results_nhanesreal.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/py_test_output/top_results_nhanesreal_no_cutoff.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/py_test_output/top_results_nhanessmall.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading