neurobooth · sid4py · Jan 10, 2025 · Jan 10, 2025 · Jan 10, 2025 · Jan 10, 2025
diff --git a/neurobooth_terra/views/sql/rc_demographic_clean.sql b/neurobooth_terra/views/sql/rc_demographic_clean.sql
@@ -13,7 +13,7 @@ SELECT
     dem.redcap_event_name,
     (regexp_match(REPLACE(dem.redcap_event_name, 'enrollment', 'v1'), 'v(\d+)_arm_\d+'))[1]::int AS redcap_sequence_num,
 	(regexp_match(REPLACE(dem.redcap_event_name, 'enrollment', 'v1'), 'v\d+_arm_(\d+)'))[1]::int AS redcap_study_arm,
-    pinfo.test_subject_boolean,
+    bdata.test_subject_boolean,
     CASE
         WHEN dem.demographic_complete = 2 THEN TRUE
         ELSE FALSE
@@ -152,6 +152,8 @@ SELECT
 FROM rc_demographic dem
 RIGHT OUTER JOIN rc_participant_and_consent_information pinfo
     ON dem.subject_id = pinfo.subject_id  -- Should by a many-to-one join
+RIGHT OUTER JOIN rc_baseline_data bdata
+    ON dem.subject_id = bdata.subject_id
 LEFT OUTER JOIN subject subj
     ON pinfo.subject_id = subj.subject_id
 ORDER BY

diff --git a/neurobooth_terra/views/sql/v_longitudinal_summary.sql b/neurobooth_terra/views/sql/v_longitudinal_summary.sql
@@ -27,7 +27,7 @@ WITH latest_diagnosis AS (
 		MAX(visit.neurobooth_visit_dates) AS last_visit,
 		MAX(visit.neurobooth_visit_dates) - MIN(visit.neurobooth_visit_dates) AS total_days
 	FROM rc_visit_dates visit
-	JOIN rc_participant_and_consent_information subj_info
+	JOIN rc_baseline_data subj_info
 		ON visit.subject_id = subj_info.subject_id
 	WHERE subj_info.test_subject_boolean = FALSE
 	GROUP BY

diff --git a/scripts/redcap_metadata_to_postgres.py b/scripts/redcap_metadata_to_postgres.py
@@ -30,7 +30,8 @@
 from neurobooth_terra.views.views import create_views, drop_views
 from config import ssh_args, db_args, project
 
-# %%
+
+# ------
 # Let us first define the surveys and their survey IDs that we want to fetch.
 # This information can be found on Redcap. To fetch Redcap data, you will
 # also need to define the NEUROBOOTH_REDCAP_TOKEN environment variable.
@@ -80,17 +81,10 @@
     'pgic_followup_visits': 175964, # patient_global_impression_of_change_since_last_time_point
     'cortical_basal_ganglia_functional_scale': 175965,
     'psp_staging': 133558,
+    'baseline_data': 184345,
 }
+# ------
 
-# TODOs
-# table column mapping
-# email regardless of error/warning
-
-# %%
-# Next, we fetch the metadata table. This table is the master table
-# that contains columns and their information. It can be used to infer
-# information about the columns: example, what choices are available for a
-# particular question.
 
 def _correct_response_array(metadata_df):
     '''Correct response array values for field_name
@@ -133,6 +127,12 @@ def _correct_response_array(metadata_df):
 
     return metadata_df
 
+
+# ------
+# Next, we fetch the metadata table. This table is the master table
+# that contains columns and their information. It can be used to infer
+# information about the columns, for example which choices are available
+# for a particular question.
 print('Fetching metadata ...')
 metadata = project.export_metadata(format='df')
 # make any correction to redcap data dictionary as soon as it is received from redcap
@@ -143,11 +143,10 @@ def _correct_response_array(metadata_df):
                    'text_validation_type_or_show_slider_number']
 metadata = metadata[metadata_fields]
 metadata.to_csv('data_dictionary.csv')
-# print(metadata.loc['prom_ataxia_54', 'select_choices_or_calculations'])
+
 print('[Done]')
+# ------
 
-# metadata = metadata[metadata.redcap_form_name.isin(
-#    ['subject', 'participant_and_consent_information', 'demograph'])]
 
 for column in ['section_header', 'field_label']:
     metadata[column] = metadata[column].apply(
@@ -207,12 +206,28 @@ def _correct_response_array(metadata_df):
 
         # for table_id, table_info in table_infos.items():
         for table_id, _ in survey_ids.items():
+
             table_info = table_infos[table_id]
             print(f'Overwriting table {table_id}')
             drop_table('rc_' + table_id, conn)
+
+            # ------
+            # As we re-consent participants, consent tables such as 'consent_nih_sca'
+            # and 'participant_and_consent_information' get multiple rows for the
+            # same subject_id, each with an incremented redcap_repeat_instance value.
+            # This causes a duplicate primary key error because the key columns
+            # [subject_id, redcap_event_name] are identical across these rows.
+            #
+            # To resolve this, redcap_repeat_instance is added to the composite primary key.
+            primary_keys = ['subject_id', 'redcap_event_name']
+            consent_tables = ['participant_and_consent_information', 'consent_nih_sca']
+            if table_id in consent_tables:
+                primary_keys = ['subject_id', 'redcap_event_name', 'redcap_repeat_instance']
+            # ------
+
             table = create_table('rc_' + table_id, conn, table_info['columns']+table_info['indicator_columns'],
                                  table_info['dtypes']+(['smallint[]']*len(table_info['indicator_columns'])),
-                                 primary_key=['subject_id', 'redcap_event_name'])
+                                 primary_key=primary_keys)
             df = fetch_survey(project, survey_name=table_id,
                               survey_id=survey_ids[table_id])
             df = df.rename(columns={'record_id': 'subject_id'})
@@ -224,6 +239,13 @@ def _correct_response_array(metadata_df):
                 warn(f'Skipping {table_id} because of missing complete col')
                 continue
             df = df[df[complete_col[0]] == 2]
+            # For the consent tables we only want to keep rows where the
+            # redcap_repeat_instance column has a non-null value (the >= 1
+            # comparison below is False for nulls, so those rows are dropped).
+            # Additional context: redcap_repeat_instance is autogenerated by
+            # REDCap, which (for reasons not yet understood) also emits null-valued rows.
+            if table_id in consent_tables:
+                df = df[df['redcap_repeat_instance'] >= 1]
 
             report_cols = set([col.split('___')[0] for col in df.columns])
             extra_cols = report_cols - (set(table_info['columns']) |