-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEATURE] Creating a new command to export database data to CSV.
Issue #4561
- Loading branch information
1 parent
b11e4eb
commit ccb2375
Showing
4 changed files
with
189 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
--- | ||
name: Export Data to CSV | ||
on: | ||
# schedule: | ||
# Monthly, on the 5th, at 8am UTC (3am EST) | ||
# - cron: '0 8 5 * *' | ||
workflow_dispatch: | ||
inputs: | ||
environment: | ||
required: true | ||
type: choice | ||
description: The environment the workflow should run on. | ||
options: | ||
- dev | ||
- staging | ||
- preview | ||
- production | ||
|
||
jobs: | ||
scheduled-data-export: | ||
if: ${{ github.event_name == 'schedule' }} | ||
strategy: | ||
matrix: | ||
environments: ["production"] # For now, just do the scheduled job on production to save space | ||
name: Run data export on ${{ inputs.environment }} | ||
runs-on: ubuntu-latest | ||
environment: ${{ matrix.environments }} | ||
env: | ||
space: ${{ matrix.environments }} | ||
steps: | ||
- name: Run Command | ||
uses: cloud-gov/cg-cli-tools@main | ||
with: | ||
cf_username: ${{ secrets.CF_USERNAME }} | ||
cf_password: ${{ secrets.CF_PASSWORD }} | ||
cf_org: gsa-tts-oros-fac | ||
cf_space: ${{ env.space }} | ||
command: cf run-task gsa-fac -k 2G -m 2G --name export_data_to_csv --command "python manage.py export_data" | ||
|
||
dispatch-data-export: | ||
if: ${{ github.event.inputs.environment != '' }} | ||
name: Run data export on ${{ inputs.environment }} | ||
runs-on: ubuntu-latest | ||
environment: ${{ inputs.environment }} | ||
env: | ||
space: ${{ inputs.environment }} | ||
steps: | ||
- name: Run Command | ||
uses: cloud-gov/cg-cli-tools@main | ||
with: | ||
cf_username: ${{ secrets.CF_USERNAME }} | ||
cf_password: ${{ secrets.CF_PASSWORD }} | ||
cf_org: gsa-tts-oros-fac | ||
cf_space: ${{ env.space }} | ||
command: cf run-task gsa-fac -k 2G -m 2G --name export_data_to_csv --command "python manage.py export_data" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
import os | ||
import logging | ||
|
||
from config import settings | ||
from datetime import datetime | ||
from django.core.management.base import BaseCommand, CommandError | ||
from sling import Replication, ReplicationStream | ||
|
||
from support.decorators import newrelic_timing_metric | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
S3_CONNECTION = f"""{{ | ||
"type": "s3", | ||
"bucket": "export", | ||
"access_key_id": "{settings.AWS_PRIVATE_ACCESS_KEY_ID}", | ||
"secret_access_key": "{settings.AWS_PRIVATE_SECRET_ACCESS_KEY}", | ||
"endpoint": "{settings.AWS_S3_ENDPOINT_URL}" | ||
}} | ||
""" | ||
DB_URL = os.environ.get("DATABASE_URL") | ||
FAC_DB_URL = ( | ||
f"{DB_URL}?sslmode=disable" if settings.ENVIRONMENT in ["LOCAL", "TEST"] else DB_URL | ||
) | ||
DEFAULT_OPTIONS = { | ||
"target_options": { | ||
"format": "csv", | ||
"compression": "gzip", | ||
"file_max_rows": 0, | ||
} | ||
} | ||
|
||
|
||
class StreamGenerator: | ||
COMMON_QUERY = ( | ||
"select * from {table_name} where report_id in (" | ||
" select dg.report_id from public.dissemination_general dg" | ||
" where dg.is_public = 'true'" | ||
" and dg.audit_year = '{audit_year}' )" | ||
) | ||
|
||
def __init__(self, table_name, friendly_name, query=COMMON_QUERY): | ||
self.table_name = table_name | ||
self.friendly_name = friendly_name | ||
self.query = query | ||
|
||
def generate_stream(self, audit_year): | ||
return ( | ||
f"{self.table_name}.{audit_year}", | ||
ReplicationStream( | ||
object=f"bulk_export/{{MM}}/{audit_year}_{self.friendly_name}.csv", | ||
sql=self.query.format( | ||
table_name=self.table_name, audit_year=audit_year | ||
), | ||
mode="full-refresh", | ||
target_options={"format": "csv"}, | ||
), | ||
) | ||
|
||
|
||
STREAM_GENERATORS = [ | ||
StreamGenerator( | ||
friendly_name="General", | ||
table_name="dissemination_general", | ||
query="select * from dissemination_general dg" | ||
" where dg.is_public = 'true' and" | ||
" dg.audit_year = '{audit_year}'", | ||
), | ||
StreamGenerator( | ||
friendly_name="AdditionalEIN", table_name="dissemination_additionalein" | ||
), | ||
StreamGenerator( | ||
friendly_name="AdditionalUEI", table_name="dissemination_additionaluei" | ||
), | ||
StreamGenerator(friendly_name="Combined", table_name="dissemination_combined"), | ||
StreamGenerator( | ||
friendly_name="CorrectiveActionPlans", table_name="dissemination_captext" | ||
), | ||
StreamGenerator( | ||
friendly_name="FederalAward", table_name="dissemination_federalaward" | ||
), | ||
StreamGenerator(friendly_name="Finding", table_name="dissemination_finding"), | ||
StreamGenerator( | ||
friendly_name="FindingText", table_name="dissemination_findingtext" | ||
), | ||
StreamGenerator(friendly_name="Note", table_name="dissemination_note"), | ||
StreamGenerator( | ||
friendly_name="PassThrough", table_name="dissemination_passthrough" | ||
), | ||
StreamGenerator( | ||
friendly_name="SecondaryAuditor", table_name="dissemination_secondaryauditor" | ||
), | ||
] | ||
|
||
|
||
@newrelic_timing_metric("data_export") | ||
def _run_data_export(): | ||
logger.info("Begin exporting data") | ||
# We may want to consider instead of hardcoding 2016 only export the past X years. | ||
# This will only export data that exists, so doing +2 just incase some data is in early | ||
years = range(2016, datetime.today().year + 2) | ||
streams = {} | ||
for stream_generator in STREAM_GENERATORS: | ||
for year in years: | ||
streams.update([stream_generator.generate_stream(year)]) | ||
|
||
replication = Replication( | ||
source="FAC_DB", | ||
target="BULK_DATA_EXPORT", | ||
streams=streams, | ||
defaults=DEFAULT_OPTIONS, | ||
env=dict(FAC_DB=FAC_DB_URL, BULK_DATA_EXPORT=S3_CONNECTION), | ||
) | ||
logger.info(f"Exporting {len(streams)} streams") | ||
replication.run() | ||
logger.info("Successfully exported data") | ||
|
||
|
||
class Command(BaseCommand): | ||
def handle(self, *args, **kwargs): | ||
try: | ||
_run_data_export() | ||
except Exception as ex: | ||
logger.error("An error occurred while exporting data", exc_info=ex) | ||
raise CommandError("Error while exporting data") |