Skip to content

Commit

Permalink
Include access_role value in genomic sequencing ETL
Browse files Browse the repository at this point in the history
An `access_role` column was recently added to the `sample`, `sequencing_read_set`, `genomic_sequence`, and `consensus_genome` tables of the `warehouse` schema to allow data with restricted access to be included in these tables. These changes include `access_role` in the appropriate class definitions and update the ETLs to propagate the `access_role` value found in the `sample` record to the related records.

The sample record will continue to serve as the primary source of the `access_role` value, but the value must also be stored in each table that uses row-level security.
  • Loading branch information
davereinhart committed Nov 27, 2023
1 parent 5dc3023 commit f1de9c0
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 12 deletions.
4 changes: 2 additions & 2 deletions lib/id3c/cli/command/etl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def update_sample(db: DatabaseSession,
update warehouse.sample
set encounter_id = %s
where sample_id = %s
returning sample_id as id, identifier
returning sample_id as id, identifier, access_role
""", (encounter_id, sample.id))

assert sample.id, "Updating encounter_id affected no rows!"
Expand Down Expand Up @@ -341,7 +341,7 @@ def find_sample(db: DatabaseSession, identifier: str, for_update = True) -> Any:
query_ending = "for update"

sample = db.fetch_row("""
select sample_id as id, identifier, encounter_id
select sample_id as id, identifier, encounter_id, access_role
from warehouse.sample
where identifier = %s or
collection_identifier = %s
Expand Down
23 changes: 14 additions & 9 deletions lib/id3c/cli/command/etl/consensus_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def find_or_create_sequence_read_set(db: DatabaseSession, document: dict, sample
"""))

sequence_read_set: SequenceReadSetRecord = db.fetch_row("""
select sequence_read_set_id as id, sample_id, urls
select sequence_read_set_id as id, sample_id, urls, access_role
from warehouse.sequence_read_set
where sample_id = %s
and urls @> %s
Expand All @@ -147,20 +147,24 @@ def find_or_create_sequence_read_set(db: DatabaseSession, document: dict, sample

if sequence_read_set:
LOG.info(f"Found sequence read set {sequence_read_set.id}")
if sample.access_role:
assert sample.access_role == sequence_read_set.access_role, \
f"Access_role for sample id «{sample.id}» does not match sequence read set id «{sequence_read_set.id}» "
else:
LOG.debug(dedent(f"""
Sequence read set not found for sample id «{sample.id}» and urls {urls}
"""))

data = {
"sample_id": sample.id,
"access_role": sample.access_role,
"urls": urls,
}

sequence_read_set = db.fetch_row("""
insert into warehouse.sequence_read_set (sample_id, urls)
values (%(sample_id)s, %(urls)s)
returning sequence_read_set_id as id, sample_id, urls
insert into warehouse.sequence_read_set (sample_id, urls, access_role)
values (%(sample_id)s, %(urls)s, %(access_role)s)
returning sequence_read_set_id as id, sample_id, urls, access_role
""", data)

LOG.info(f"Created sequence read set {sequence_read_set.id}")
Expand Down Expand Up @@ -254,19 +258,20 @@ def upsert_genome(db: DatabaseSession, sequence_read_set: SequenceReadSetRecord,
"sample_id": sequence_read_set.sample_id,
"organism_id": organism.id,
"sequence_read_set_id": sequence_read_set.id,
"additional_details": Json(document['summary_stats'])
"additional_details": Json(document['summary_stats']),
"access_role": sequence_read_set.access_role
}

genome: GenomeRecord = db.fetch_row("""
insert into warehouse.consensus_genome (sample_id, organism_id,
sequence_read_set_id, details)
sequence_read_set_id, details, access_role)
values (%(sample_id)s, %(organism_id)s, %(sequence_read_set_id)s,
%(additional_details)s)
%(additional_details)s, %(access_role)s)
on conflict (sample_id, organism_id, sequence_read_set_id) do update
set details = %(additional_details)s
set details = %(additional_details)s, access_role = %(access_role)s
returning consensus_genome_id as id, sample_id, organism_id, sequence_read_set_id
returning consensus_genome_id as id, sample_id, organism_id, sequence_read_set_id, access_role
""", data)

assert genome.id, "Upsert affected no rows!"
Expand Down
3 changes: 2 additions & 1 deletion lib/id3c/cli/command/etl/kit.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,8 @@ def find_sample(db: DatabaseSession, identifier: str) -> Optional[SampleRecord]:
select sample_id as id,
identifier,
encounter_id,
details ->> 'sample_type' as type
details ->> 'sample_type' as type,
access_role
from warehouse.sample
where sample.identifier = %s
""", (identifier,))
Expand Down
4 changes: 4 additions & 0 deletions lib/id3c/db/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@ class IdentifierRecord(NamedTuple):
class MinimalSampleRecord(NamedTuple):
id: int
identifier: str
access_role: Optional[str]

class SampleRecord(NamedTuple):
id: int
identifier: str
encounter_id: Optional[int]
type: Optional[str]
access_role: Optional[str]

class KitRecord(NamedTuple):
id: int
Expand All @@ -36,12 +38,14 @@ class SequenceReadSetRecord(NamedTuple):
id: int
sample_id: int
urls: Optional[List[str]]
access_role: Optional[str]

class GenomeRecord(NamedTuple):
id: int
sample_id: int
organism_id: int
sequence_read_set_id: int
access_role: Optional[str]

class MinimalLocationRecord(NamedTuple):
id: int
Expand Down

0 comments on commit f1de9c0

Please sign in to comment.