From 0df7c5cf7e5f36a5ff3ea3354f5171b2d8a37ae9 Mon Sep 17 00:00:00 2001 From: <> Date: Sun, 5 Nov 2023 07:25:57 +0000 Subject: [PATCH] Deployed 4e3d966 with MkDocs version: 1.5.3 --- python_api/step/finngen/index.html | 44 +++++++++++------------------ search/search_index.json | 2 +- sitemap.xml.gz | Bin 127 -> 127 bytes 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/python_api/step/finngen/index.html b/python_api/step/finngen/index.html index 00c66c0b1..a887cb280 100644 --- a/python_api/step/finngen/index.html +++ b/python_api/step/finngen/index.html @@ -52,12 +52,7 @@ 66 67 68 -69 -70 -71 -72 -73 -74
@dataclass
+69
@dataclass
class FinnGenStep:
"""FinnGen ingestion step.
@@ -82,39 +77,34 @@
def __post_init__(self: FinnGenStep) -> None:
"""Run step."""
- # Read the JSON data from the URL.
+ # Fetch study index.
json_data = urlopen(self.finngen_phenotype_table_url).read().decode("utf-8")
rdd = self.session.spark.sparkContext.parallelize([json_data])
df = self.session.spark.read.json(rdd)
-
- # Parse the study index data.
- finngen_studies = FinnGenStudyIndex.from_source(
+ # Process study index.
+ study_index = FinnGenStudyIndex.from_source(
df,
self.finngen_release_prefix,
self.finngen_sumstat_url_prefix,
self.finngen_sumstat_url_suffix,
)
-
- # Write the study index output.
- finngen_studies.df.write.mode(self.session.write_mode).parquet(
+ # Write study index.
+ study_index.df.write.mode(self.session.write_mode).parquet(
self.finngen_study_index_out
)
- # Prepare list of files for ingestion.
- input_filenames = [
- row.summarystatsLocation for row in finngen_studies.collect()
- ]
+ # Fetch summary stats.
+ input_filenames = [row.summarystatsLocation for row in study_index.collect()]
summary_stats_df = self.session.spark.read.option("delimiter", "\t").csv(
input_filenames, header=True
)
-
- # Specify data processing instructions.
- summary_stats_df = FinnGenSummaryStats.from_finngen_harmonized_summary_stats(
- summary_stats_df
- ).df
-
- # Sort and partition for output.
- summary_stats_df.sortWithinPartitions("position").write.partitionBy(
- "studyId", "chromosome"
- ).mode(self.session.write_mode).parquet(self.finngen_summary_stats_out)
+ # Process summary stats.
+ summary_stats_df = FinnGenSummaryStats.from_source(summary_stats_df).df
+ # Write summary stats.
+ (
+ summary_stats_df.sortWithinPartitions("position")
+ .write.partitionBy("studyId", "chromosome")
+ .mode(self.session.write_mode)
+ .parquet(self.finngen_summary_stats_out)
+ )