From 0b4371092f273508e5915bc323fafffa43eb28c5 Mon Sep 17 00:00:00 2001
From: Tim Mower
Date: Mon, 16 Dec 2024 08:00:24 -0800
Subject: [PATCH] Add --uses-bulkdata argument to paasta spark run

This changes paasta spark run so that
https://github.yelpcorp.com/sysgit/yelpsoa-configs/pull/52010 works as
expected.

I'm not checking here whether the /nail/bulkdata volume is already
specified in the Spark config, e.g.
`spark.kubernetes.executor.volumes.hostPath.0.mount.path=/nail/bulkdata` -
specifying the volume there while also setting uses_bulkdata to True would
result in the same docker volume being mounted twice, which would cause a
failure.

This follows on from [this conversation in
Slack](https://yelp.slack.com/archives/CA8BWU65D/p1729768030212919) and
will allow us to complete [this
project](https://yelpwiki.yelpcorp.com/display/PRODENG/Project+Incredible+Bulk).
---
 paasta_tools/cli/cmds/spark_run.py | 10 ++++++++++
 tests/cli/test_cmds_spark_run.py   | 10 ++++++++++
 2 files changed, 20 insertions(+)

diff --git a/paasta_tools/cli/cmds/spark_run.py b/paasta_tools/cli/cmds/spark_run.py
index 3b9300ebbf..4655608058 100644
--- a/paasta_tools/cli/cmds/spark_run.py
+++ b/paasta_tools/cli/cmds/spark_run.py
@@ -350,6 +350,13 @@ def add_subparser(subparsers):
         default=False,
     )
 
+    list_parser.add_argument(
+        "--uses-bulkdata",
+        help="Mount /nail/bulkdata in the container",
+        action="store_true",
+        default=False,
+    )
+
     aws_group = list_parser.add_argument_group(
         title="AWS credentials options",
         description="If --aws-credentials-yaml is specified, it overrides all "
@@ -785,6 +792,9 @@ def configure_and_run_docker_container(
     else:
         raise UnsupportedClusterManagerException(cluster_manager)
 
+    if args.uses_bulkdata:
+        volumes.append("/nail/bulkdata:/nail/bulkdata:ro")
+
     volumes.append("%s:rw" % args.work_dir)
     volumes.append("/nail/home:/nail/home:rw")
 
diff --git a/tests/cli/test_cmds_spark_run.py b/tests/cli/test_cmds_spark_run.py
index 022b707b8b..19eaf02de4 100644
--- a/tests/cli/test_cmds_spark_run.py
+++ b/tests/cli/test_cmds_spark_run.py
@@ -429,6 +429,7 @@ class TestConfigureAndRunDockerContainer:
             "fake_dir",
         )
 
+    @pytest.mark.parametrize("uses_bulkdata", [True, False])
     @pytest.mark.parametrize(
         ["cluster_manager", "spark_args_volumes", "expected_volumes"],
         [
@@ -468,6 +469,7 @@ def test_configure_and_run_docker_container(
         cluster_manager,
         spark_args_volumes,
         expected_volumes,
+        uses_bulkdata,
     ):
         mock_get_username.return_value = "fake_user"
         spark_conf = {
@@ -494,6 +496,7 @@ def test_configure_and_run_docker_container(
         args.tronfig = None
         args.job_id = None
         args.use_service_auth_token = False
+        args.uses_bulkdata = uses_bulkdata
         with mock.patch.object(
             self.instance_config, "get_env_dictionary", return_value={"env1": "val1"}
         ):
@@ -512,10 +515,15 @@ def test_configure_and_run_docker_container(
             spark_config_dict=spark_conf,
             is_mrjob=args.mrjob,
         )
+        if uses_bulkdata:
+            bulkdata_volumes = ["/nail/bulkdata:/nail/bulkdata:ro"]
+        else:
+            bulkdata_volumes = []
         mock_run_docker_container.assert_called_once_with(
             container_name="fake_app",
             volumes=(
                 expected_volumes
+                + bulkdata_volumes
                 + [
                     "/fake_dir:/spark_driver:rw",
                     "/nail/home:/nail/home:rw",
@@ -609,6 +617,7 @@ def test_configure_and_run_docker_driver_resource_limits_config(
         args.docker_memory_limit = "4g"
         args.docker_shm_size = "1g"
         args.use_service_auth_token = False
+        args.uses_bulkdata = False
         with mock.patch.object(
             self.instance_config, "get_env_dictionary", return_value={"env1": "val1"}
         ):
@@ -724,6 +733,7 @@ def test_configure_and_run_docker_driver_resource_limits(
         args.docker_memory_limit = False
         args.docker_shm_size = False
         args.use_service_auth_token = False
+        args.uses_bulkdata = False
         with mock.patch.object(
             self.instance_config, "get_env_dictionary", return_value={"env1": "val1"}
         ):
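
Usage sketch for reviewers - a minimal example, assuming the standard
`paasta spark-run` entry point; only `--uses-bulkdata` itself is introduced
by this patch, all other arguments are elided:

    # Opt the Spark driver container into the read-only bulkdata mount:
    paasta spark-run --uses-bulkdata ...

    # configure_and_run_docker_container() then appends the docker volume:
    #   /nail/bulkdata:/nail/bulkdata:ro

As described in the commit message, jobs using this flag (or setting
uses_bulkdata to True in yelpsoa-configs, per the linked PR) should not also
mount /nail/bulkdata through `spark.kubernetes.executor.volumes.hostPath.*`
settings, since mounting the same volume twice would cause a failure.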