diff --git a/.github/workflows/test_with_clone.yaml b/.github/workflows/test_with_clone.yaml new file mode 100644 index 00000000..9b12ced9 --- /dev/null +++ b/.github/workflows/test_with_clone.yaml @@ -0,0 +1,179 @@ +--- +name: Test Changes with Cloned DB + +on: + pull_request: + types: [ labeled, synchronize, closed ] + push: + +permissions: + contents: read + +jobs: + + create_clone_and_run_schemachange: + runs-on: ubuntu-latest + if: contains(github.event.pull_request.labels.*.name, 'create_clone_and_run_schemachange') && github.event.pull_request.state == 'open' + environment: dev + env: + SNOWFLAKE_PASSWORD: ${{ secrets.SNOWSQL_PWD }} + SNOWFLAKE_ACCOUNT: ${{ secrets.SNOWSQL_ACCOUNT }} + SNOWFLAKE_USER: ${{ secrets.SNOWSQL_USER }} + SNOWFLAKE_WAREHOUSE: ${{ secrets.SNOWSQL_WAREHOUSE }} + SNOWFLAKE_CLONE_ROLE: DATA_ENGINEER + SNOWFLAKE_SCHEMACHANGE_ROLE: SYSADMIN + SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE_ORIG: ${{ vars.SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE }} + SNOWFLAKE_SYNAPSE_STAGE_STORAGE_INTEGRATION: ${{ vars.SNOWFLAKE_SYNAPSE_STAGE_STORAGE_INTEGRATION }} + SNOWFLAKE_SYNAPSE_STAGE_URL: ${{ vars.SNOWFLAKE_SYNAPSE_STAGE_URL }} + CLONE_NAME: "${{ vars.SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE }}_${{ github.head_ref }}" + STACK: ${{ vars.STACK }} + + steps: + + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install python libraries + shell: bash + run: | + pip install schemachange==3.6.1 + pip install numpy==1.26.4 + pip install pandas==1.5.3 + + - name: Configure Snowflake connections + run: | + # Single config.toml shared by both connections; "dpe" (DATA_ENGINEER) is the default + config_file_dpe=$(mktemp) + echo 'default_connection_name = "dpe"' >> $config_file_dpe + echo '[connections.dpe]' >> $config_file_dpe + echo "account = \"${SNOWFLAKE_ACCOUNT}\"" >> $config_file_dpe + echo "user = \"${SNOWFLAKE_USER}\"" >> $config_file_dpe + echo "role = \"${SNOWFLAKE_CLONE_ROLE}\"" >> $config_file_dpe + echo "password = \"${SNOWFLAKE_PASSWORD}\"" >> 
$config_file_dpe + echo "warehouse = \"${SNOWFLAKE_WAREHOUSE}\"" >> $config_file_dpe + echo 'authenticator = "SNOWFLAKE"' >> $config_file_dpe + + # Second connection in the same config.toml: "sysadmin" (SYSADMIN role) + echo '[connections.sysadmin]' >> $config_file_dpe + echo "account = \"${SNOWFLAKE_ACCOUNT}\"" >> $config_file_dpe + echo "user = \"${SNOWFLAKE_USER}\"" >> $config_file_dpe + echo "role = \"${SNOWFLAKE_SCHEMACHANGE_ROLE}\"" >> $config_file_dpe + echo "password = \"${SNOWFLAKE_PASSWORD}\"" >> $config_file_dpe + echo "warehouse = \"${SNOWFLAKE_WAREHOUSE}\"" >> $config_file_dpe + echo 'authenticator = "SNOWFLAKE"' >> $config_file_dpe + + # Export the config.toml path so the Snowflake CLI install step can read it + echo "SNOWFLAKE_CONFIG_PATH_DPE=$config_file_dpe" >> $GITHUB_ENV + + - name: Install Snowflake CLI with DATA_ENGINEER config + uses: Snowflake-Labs/snowflake-cli-action@v1.5 + with: + default-config-file-path: ${{ env.SNOWFLAKE_CONFIG_PATH_DPE }} + + - name: Verify Snowflake CLI installation and connections + run: | + snow --version + snow connection test -c sysadmin + snow connection test -c dpe + + - name: Sanitize Clone Name + run: | + CLONE_NAME_SANITIZED="${CLONE_NAME//[^a-zA-Z0-9_]/_}" + echo "Clone name has been updated! 
The clone name will be: ${CLONE_NAME_SANITIZED}" + echo "SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE=${CLONE_NAME_SANITIZED}" >> $GITHUB_ENV + + - name: Zero-copy clone the database + shell: bash + run: | + snow sql -q "CREATE OR REPLACE DATABASE $SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE CLONE $SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE_ORIG;" + + - name: Grant permissions to DATA_ENGINEER on cloned database + shell: bash + run: | + snow connection set-default sysadmin + + # Transfer ownership of: database + snow sql -q "GRANT OWNERSHIP ON DATABASE ${SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE} TO ROLE ${SNOWFLAKE_CLONE_ROLE} REVOKE CURRENT GRANTS;" + + # Transfer ownership of: schemas + snow sql -q "GRANT OWNERSHIP ON SCHEMA ${SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE}.SYNAPSE_RAW TO ROLE ${SNOWFLAKE_CLONE_ROLE} REVOKE CURRENT GRANTS;" + snow sql -q "GRANT OWNERSHIP ON SCHEMA ${SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE}.SYNAPSE TO ROLE ${SNOWFLAKE_CLONE_ROLE} REVOKE CURRENT GRANTS;" + + # Transfer ownership of: tables + snow sql -q "GRANT OWNERSHIP ON ALL TABLES IN SCHEMA ${SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE}.SYNAPSE_RAW TO ROLE ${SNOWFLAKE_CLONE_ROLE} REVOKE CURRENT GRANTS;" + snow sql -q "GRANT OWNERSHIP ON ALL TABLES IN SCHEMA ${SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE}.SYNAPSE TO ROLE ${SNOWFLAKE_CLONE_ROLE} REVOKE CURRENT GRANTS;" + + # Transfer ownership of: dynamic tables + snow sql -q "GRANT OWNERSHIP ON ALL DYNAMIC TABLES IN SCHEMA ${SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE}.SYNAPSE TO ROLE ${SNOWFLAKE_CLONE_ROLE} REVOKE CURRENT GRANTS;" + + - name: Run schemachange on the clone as DATA_ENGINEER + shell: bash + run: | + schemachange \ + -f synapse_data_warehouse \ + -a $SNOWFLAKE_ACCOUNT \ + -u $SNOWFLAKE_USER \ + -r $SNOWFLAKE_CLONE_ROLE \ + -w $SNOWFLAKE_WAREHOUSE \ + --config-folder synapse_data_warehouse + + drop_clone: + runs-on: ubuntu-latest + if: github.event.pull_request.merged == true || github.event.action == 'closed' + environment: dev + env: + SNOWFLAKE_PASSWORD: ${{ secrets.SNOWSQL_PWD }} + 
SNOWFLAKE_ACCOUNT: ${{ secrets.SNOWSQL_ACCOUNT }} + SNOWFLAKE_USER: ${{ secrets.SNOWSQL_USER }} + SNOWFLAKE_WAREHOUSE: ${{ secrets.SNOWSQL_WAREHOUSE }} + SNOWFLAKE_CLONE_ROLE: DATA_ENGINEER + CLONE_NAME: "${{ vars.SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE }}_${{ github.head_ref }}" + + steps: + + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Configure Snowflake connection + run: | + # Create a temporary file to hold config.toml + config_file=$(mktemp) + + # Write to config.toml file + echo 'default_connection_name = "dpe"' >> $config_file + echo '[connections.dpe]' >> $config_file + echo "account = \"${SNOWFLAKE_ACCOUNT}\"" >> $config_file + echo "user = \"${SNOWFLAKE_USER}\"" >> $config_file + echo "role = \"${SNOWFLAKE_CLONE_ROLE}\"" >> $config_file + echo "password = \"${SNOWFLAKE_PASSWORD}\"" >> $config_file + echo "warehouse = \"${SNOWFLAKE_WAREHOUSE}\"" >> $config_file + echo 'authenticator = "SNOWFLAKE"' >> $config_file + + # Write config.toml path to global environment + echo "SNOWFLAKE_CONFIG_PATH=$config_file" >> $GITHUB_ENV + + - name: Install Snowflake CLI + uses: Snowflake-Labs/snowflake-cli-action@v1.5 + with: + default-config-file-path: ${{ env.SNOWFLAKE_CONFIG_PATH }} + + - name: Verify Snowflake CLI installation and connection + run: | + snow --version + snow connection test + + - name: Sanitize Clone Name + run: | + CLONE_NAME_SANITIZED="${CLONE_NAME//[^a-zA-Z0-9_]/_}" + echo "SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE=${CLONE_NAME_SANITIZED}" >> $GITHUB_ENV + # Values written to $GITHUB_ENV only become visible in later steps, so log the local variable + echo "Clone name has been updated! 
The clone name will be: ${CLONE_NAME_SANITIZED}" + + - name: Drop the clone + shell: bash + run: | + snow sql -r $SNOWFLAKE_CLONE_ROLE -q "DROP DATABASE IF EXISTS $SNOWFLAKE_SYNAPSE_DATA_WAREHOUSE_DATABASE;" \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..8bd4c456 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,114 @@ +# Contributing Guidelines + +Welcome, and thanks for your interest in contributing to the `snowflake` repository! :snowflake: + +By contributing, you are agreeing that we may redistribute your work under this [license](https://github.com/Sage-Bionetworks/snowflake/tree/snow-90-auto-db-clone?tab=License-1-ov-file#). + +## Development Rules + +There are some things you should make a note of before getting started... + +1. **Avoid Repeatable Scripts Without Introducing Objects Through V Scripts**: + Never use repeatable scripts for tables or any other objects that can potentially be dependencies without first introducing these objects in a V script. This ensures that all dependent objects are properly established in the correct sequence. +2. **Branch Naming Convention**: + If you plan to run the automated testing described in section [Running CI Jobs for Database Testing](#running-ci-jobs-for-database-testing), your branch name needs to start with `snow-`, otherwise the test deployment will fail. + +## Getting Started + +To start contributing, follow these steps to set up and develop on your local repository: + +### 1. Clone the Repository + +```bash +git clone https://github.com/Sage-Bionetworks/snowflake +``` + +### 2. Fetch the Latest `dev` Branch + +After cloning, navigate to the repository directory: + +```bash +cd snowflake +``` + +Then, fetch the latest updates from the `dev` branch to ensure you’re working with the latest codebase: + +```bash +git fetch origin dev +``` + +### 3. 
Create a New Branch Off `dev` + +Create and checkout your feature branch from the latest `dev` branch. Name it based on the Jira ticket number and your feature/fix. For example: + +```bash +git checkout -b snow-123-new-feature origin/dev +``` + +Your branch will now be tracking `origin/dev` which you can merge with or rebase onto should a merge conflict occur. For more guidance +on how to resolve merge conflicts, [see here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/addressing-merge-conflicts/about-merge-conflicts#resolving-merge-conflicts). + +### 4. Push to The Remote Branch + +Once you've made your changes and committed them locally, push your branch to the remote repository: + +```bash +git push origin snow-123-new-feature +``` + +### 5. Create a Draft Pull Request + +In order to initiate automated testing you will need to work on a draft pull request (PR) on GitHub. After pushing your commits to +the remote branch in Step 4, use the GitHub UI to initiate a PR and convert it to draft mode. + +After testing your changes against `schemachange` using the instructions in [Running CI Jobs for Database Testing](#running-ci-jobs-for-database-testing), +you can then take your PR out of draft mode by marking it as Ready for Review in the GitHub UI. + +## Running CI Jobs for Database Testing + +This repository includes automated CI jobs to validate changes against a cloned database. If you want to trigger these jobs to test your changes in an isolated database environment, please follow the steps below: + +### 1. Add the Label + +Add the label `create_clone_and_run_schemachange` to your PR to trigger the CI workflow. This job does two things: + +* Creates a zero-copy clone of the database and runs your proposed schema changes against it. +* Tests your schema changes on a cloned version of the development database, verifying that your updates work correctly without +affecting the real development database. 
After the PR is merged or closed, the clone is automatically dropped to free up resources. + +> [!IMPORTANT] +> Your cloned database is a clone of the development database as it exists at the time of cloning. Please be mindful that +> **there may have been changes made to the development database since your last clone**. + +> [!NOTE] +> As you are developing on your branch, you may want to re-run the `schemachange` test on your updates. +> You can unlabel and relabel the PR with `create_clone_and_run_schemachange` to re-trigger the job. + +### 2. Perform Inspection using Snowsight + +You can go on Snowsight to perform manual inspection of the schema changes in your cloned database. We recommend using a SQL worksheet for manual quality assurance queries, e.g. ensure there is no row duplication in the new/updated tables. + +> [!TIP] +> Your database will be named after your feature branch so it's easy to find on Snowsight. For example, if your feature branch is called +> `snow-123-new-feature`, your database might be called `SYNAPSE_DATA_WAREHOUSE_DEV_SNOW_123_NEW_FEATURE`. + +### 3. Manually Drop the Cloned Database (Optional) + +There is a second job in the repository (`drop_clone`) that will drop your branch's database clone once your PR has been merged into `dev` or closed. +In other words, once your cloned database is created for testing, it will remain open until your PR is closed (unless you manually drop it). + +An initial clone of the development database will not incur new resource costs, **HOWEVER**, when a clone deviates from the original +(e.g. new schema changes are applied for testing), the cloned database will begin to incur costs the longer it exists in our warehouse. +**Please be mindful of the amount of time your PR stays open**, as cloned databases do not get dropped until a PR is merged or closed. For example, if your PR is open for >1 week, consider manually dropping your cloned database on Snowflake to avoid unnecessary cost. 
+ +> [!NOTE] +> Keep in mind that after dropping your cloned database, you will still have access to it through Snowflake's "Time Travel" +> feature. Your database is retained through "Time Travel" for a limited retention period before it is permanently deleted. To see +> how long your database can be accessed after dropping, run the following query in a SQL worksheet on Snowsight and look +> for the keyword `DATA_RETENTION_TIME_IN_DAYS`: +> +> ``` +> SHOW PARAMETERS IN DATABASE <your_database_name>; +> ``` + +Following these guidelines helps maintain a clean, efficient, and well-tested codebase. Thank you for contributing! diff --git a/README.md b/README.md index 697fc06d..efe88da9 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ pip install "snowflake-connector-python[pandas]" "synapseclient[pandas]" python- ## Contributing -WIP +For contribution guidelines, please see the `CONTRIBUTING.md` file in this repository. ## Visualizing with Streamlit diff --git a/synapse_data_warehouse/synapse/tables/R__my_table.sql b/synapse_data_warehouse/synapse/tables/R__my_table.sql new file mode 100644 index 00000000..a17c93f5 --- /dev/null +++ b/synapse_data_warehouse/synapse/tables/R__my_table.sql @@ -0,0 +1,17 @@ +USE SCHEMA {{database_name}}.synapse; --noqa: JJ01,PRS,TMP + +-- Create the dummy table with example columns +CREATE TABLE IF NOT EXISTS my_table2 ( + id INT, + name STRING, + created_at TIMESTAMP, + value FLOAT +); + +-- Insert arbitrary rows into the dummy table +INSERT INTO my_table2 (id, name, created_at, value) VALUES + (1, 'Alpha', CURRENT_TIMESTAMP, 1000.5), + (2, 'Beta', CURRENT_TIMESTAMP, 20.0), + (3, 'Gamma', CURRENT_TIMESTAMP, 30.75), + (4, 'Delta', CURRENT_TIMESTAMP, 40.1), + (5, 'Epsilon', CURRENT_TIMESTAMP, 50.9); \ No newline at end of file