From 61dbed53fd2a2ee5428f54f9376651fad586edde Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 10 May 2023 11:57:29 -0400 Subject: [PATCH 1/7] add rgfa interface --- src/cactus/cactus_progressive_config.xml | 2 + src/cactus/refmap/cactus_graphmap_join.py | 86 ++++++++++++++++++++--- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index d4ce98906..41ea7bf1a 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -380,6 +380,7 @@ + diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index b383bb626..6517329a2 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -111,6 +111,8 @@ def graphmap_join_options(parser): parser.add_argument("--filter", type=int, default=2, help = "Generate a frequency filtered graph (from the clipped graph) by removing any sequence present in fewer than this many sequences. Set to 0 to disable. [default=2]") parser.add_argument("--gfa", nargs='*', default=None, help = "Produce a GFA for given graph type(s) if specified. Valid types are 'full', 'clip', and 'filter'. If no type specified 'clip' will be used ('full' used if clipping disabled). Multiple types can be provided separated by a space. [--gfa clip assumed by default]") + + parser.add_argument("--rgfa", nargs='*', default=None, help = "Produce a rGFA cover for given graph type(s) if specified. The rGFA cover will be output as a separate .rgfa.gz file, as well as included in the .gbz is reference path segements. This cover will be based on the (first) reference sample only. Valid types are 'full', 'clip', and 'filter'. If no type specified 'clip' will be used ('full' used if clipping disabled). Multiple types can be provided separated by a space. [--gfa clip assumed by default]") parser.add_argument("--gbz", nargs='*', default=None, help = "Generate GBZ/snarls indexes for the given graph type(s) if specified. Valid types are 'full', 'clip' and 'filter'. If no type specified 'clip' will be used ('full' used if clipping disabled). Multiple types can be provided separated by a space. --giraffe will also produce these (and other) indexes") @@ -174,6 +176,17 @@ def graphmap_join_validate_options(options): if gfa == 'filter' and not options.filter: raise RuntimeError('--gfa cannot be set to filter since filtering is disabled') + if not options.rgfa: + options.rgfa = ['clip'] if options.clip else ['full'] + options.rgfa = list(set(options.rgfa)) + for rgfa in options.rgfa: + if rgfa not in ['clip', 'filter', 'full']: + raise RuntimeError('Unrecognized value for --gfa: {}. Must be one of {clip, filter, full}'.format(gfa)) + if rgfa == 'clip' and not options.clip: + raise RuntimError('--rgfa cannot be set to clip since clipping is disabled') + if rgfa == 'filter' and not options.filter: + raise RuntimeError('--rgfa cannot be set to filter since filtering is disabled') + if options.gbz == []: options.gbz = ['clip'] if options.clip else ['full'] options.gbz = list(set(options.gbz)) if options.gbz else [] @@ -285,10 +298,10 @@ def graphmap_join_validate_options(options): raise RuntimeError('Specifying --haplo {} requires also specifying --giraffe {}'.format(haplo, haplo)) # Prevent some useless compute due to default param combos - if options.clip and 'clip' not in options.gfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw\ - and 'filter' not in options.gfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw: + if options.clip and 'clip' not in options.gfa + options.rgfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw\ + and 'filter' not in options.gfa + options.rgfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw: options.clip = None - if options.filter and 'filter' not in options.gfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw: + if options.filter and 'filter' not in options.gfa + options.rgfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw: options.filter = None return options @@ -459,12 +472,13 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): gfa_root_job = Job() phase_root_job.addFollowOn(gfa_root_job) gfa_ids = [] + rgfa_ids = [] current_out_dict = None do_gbz = workflow_phase in options.gbz + options.vcf + options.giraffe if do_gbz or workflow_phase in options.gfa: assert len(options.vg) == len(phase_vg_ids) == len(vg_ids) for vg_path, vg_id, input_vg_id in zip(options.vg, phase_vg_ids, vg_ids): - gfa_job = gfa_root_job.addChildJobFn(vg_to_gfa, options, config, vg_path, vg_id, + gfa_job = gfa_root_job.addChildJobFn(vg_to_gfa, options, config, vg_path, vg_id, rgfa=False, disk=input_vg_id.size * 10, memory=max(2**31, input_vg_id.size * 16)) gfa_ids.append(gfa_job.rv()) @@ -477,8 +491,22 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): memory=index_mem) out_dicts.append(gfa_merge_job.rv()) prev_job = gfa_merge_job - current_out_dict = gfa_merge_job.rv() + current_out_dict = gfa_merge_job.rv() + # optional rgfa + if workflow_phase in options.rgfa: + assert len(options.vg) == len(phase_vg_ids) == len(vg_ids) + for vg_path, vg_id, input_vg_id in zip(options.vg, phase_vg_ids, vg_ids): + rgfa_job = gfa_root_job.addChildJobFn(vg_to_gfa, options, config, vg_path, vg_id, rgfa=True, + disk=input_vg_id.size * 5) + rgfa_ids.append(rgfa_job.rv()) + + rgfa_merge_job = gfa_root_job.addFollowOnJobFn(merge_rgfa, options, config, rgfa_ids, + tag=workflow_phase + '.', + cores=max(1, int(options.indexCores / 4)), + disk=sum(f.size for f in vg_ids) * 3) + out_dicts.append(rgfa_merge_job.rv()) + # optional vcf if workflow_phase in options.vcf: for vcf_ref in options.vcfReference: @@ -599,6 +627,13 @@ def clip_vg(job, options, config, vg_path, vg_id, phase): # todo: do we want to add the minigraph prefix to keep stubs from minigraph? but I don't think it makes stubs.... cmd.append(stub_cmd) + if phase in options.rgfa: + # do the rgfa cover + min_rgfa_fragment = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "minRGFAFragment", typeFn=int, default=1) + rgfa_cover_cmd = ['vg', 'paths', '-v', '-', '-R', str(min_rgfa_fragment), + '-Q', options.reference[0], '-t', str(job.cores)] + cmd.append(rgfa_cover_cmd) + # enforce chopping if phase == 'full' and options.chop: cmd.append(['vg', 'mod', '-X', str(options.chop), '-']) @@ -721,14 +756,16 @@ def drop_graph_event(job, config, vg_path, full_vg_id): cactus_call(parameters=['vg', 'paths', '-d', '-S', graph_event, '-x', full_vg_path], outfile=out_path) return job.fileStore.writeGlobalFile(out_path) -def vg_to_gfa(job, options, config, vg_path, vg_id): +def vg_to_gfa(job, options, config, vg_path, vg_id, rgfa=False): """ run gfa conversion """ work_dir = job.fileStore.getLocalTempDir() vg_path = os.path.join(work_dir, os.path.basename(vg_path)) job.fileStore.readGlobalFile(vg_id, vg_path) out_path = vg_path + '.gfa' - cmd = ['vg', 'convert', '-f', '-Q', options.reference[0], os.path.basename(vg_path), '-B'] + cmd = ['vg', 'convert', '-f', os.path.basename(vg_path)] + if rgfa: + cmd += ['-Q', options.reference[0], os.path.basename(vg_path), '-B'] cactus_call(parameters=cmd, outfile=out_path, work_dir=work_dir, job_memory=job.memory) @@ -749,6 +786,29 @@ def vg_to_og(job, options, config, vg_path, vg_id): os.path.basename(og_path), '-t', str(job.cores)], work_dir=work_dir, job_memory=job.memory) return job.fileStore.writeGlobalFile(og_path) +def merge_rgfa(job, options, config, rgfa_ids, tag=''): + """ merge the rgfas, pretty much same as gfas below but doesn't + need to be folded into the indexing job as it's not used for anything downstream """ + work_dir = job.fileStore.getLocalTempDir() + + merge_rgfa_path = os.path.join(work_dir, '{}merged.rgfa'.format(tag)) + with open(merge_rgfa_path, 'w') as merge_rgfa_file: + merge_rgfa_file.write('H\tVN:Z:1.0\n') + + # merge the gfas + assert len(options.vg) == len(rgfa_ids) + for i, (vg_path, rgfa_id) in enumerate(zip(options.vg, rgfa_ids)): + rgfa_path = os.path.join(work_dir, os.path.basename(vg_path) + '.rgfa') + job.fileStore.readGlobalFile(rgfa_id, rgfa_path, mutable=True) + # get rid of paths and headers + cmd = [['grep', '-v', '^W', rgfa_path], ['grep', '-v', '^P'], ['grep', '-v', '^H']] + cactus_call(parameters=cmd, outfile=merge_rgfa_path, outappend=True) + job.fileStore.deleteGlobalFile(rgfa_id) + + cactus_call(parameters=['bgzip', merge_rgfa_path, '--threads', str(job.cores)]) + + return { '{}rgfa.gz'.format(tag) : job.fileStore.writeGlobalFile(merge_rgfa_path + '.gz') } + def make_vg_indexes(job, options, config, gfa_ids, tag="", do_gbz=False): """ merge of the gfas, then make gbz / snarls / trans """ @@ -786,9 +846,15 @@ def make_vg_indexes(job, options, config, gfa_ids, tag="", do_gbz=False): gbz_path = os.path.join(work_dir, '{}merged.gbz'.format(tag)) cactus_call(parameters=['vg', 'gbwt', '-G', merge_gfa_path, '--gbz-format', '-g', gbz_path], job_memory=job.memory) - # zip the gfa - cactus_call(parameters=['bgzip', merge_gfa_path, '--threads', str(job.cores)]) - gfa_path = merge_gfa_path + '.gz' + # zip the gfa and remove any rgfa paths (they will be kept in the GBZ and .rgfa + # but not here as it would be just too confusing) + rgfa_sample_name = '_rGFA_' + gfa_path = merge_gfa_path + '.gz' + cactus_call(parameters=[['grep', '-v', '^W\t{}'.format(rgfa_sample_name), merge_gfa_path], + ['bgzip', '--threads', str(job.cores)]], + outfile=gfa_path) + os.remove(merge_gfa_path) + # make the snarls if do_gbz: From f34c56be91172e99cf9214b08a706c4df0a65af2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 2 Oct 2023 22:37:46 -0400 Subject: [PATCH 2/7] flesh out rgfa interface --- build-tools/downloadPangenomeTools | 3 +- src/cactus/cactus_progressive_config.xml | 2 + src/cactus/refmap/cactus_graphmap_join.py | 123 ++++++++++++++++------ test/evolverTest.py | 31 +++++- 4 files changed, 123 insertions(+), 36 deletions(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 3ba214552..dbe5907a0 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -278,7 +278,8 @@ fi # vg cd ${pangenomeBuildDir} -wget -q https://github.com/vgteam/vg/releases/download/v1.51.0/vg +#wget -q https://github.com/vgteam/vg/releases/download/v1.51.0/vg +wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.30ac0c77e91fbeff06b6e7bc9a7f172607ff84ba -O vg chmod +x vg if [[ $STATIC_CHECK -ne 1 || $(ldd vg | grep so | wc -l) -eq 0 ]] then diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index 41ea7bf1a..a777118eb 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -381,6 +381,7 @@ + diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index 6517329a2..220d36246 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -54,7 +54,9 @@ from sonLib.nxnewick import NXNewick from sonLib.bioio import getTempDirectory, getTempFile, catFiles - + +rgfa_sample_name = '_rGFA_' + def main(): parser = ArgumentParser() Job.Runner.addToilOptions(parser) @@ -112,7 +114,7 @@ def graphmap_join_options(parser): parser.add_argument("--gfa", nargs='*', default=None, help = "Produce a GFA for given graph type(s) if specified. Valid types are 'full', 'clip', and 'filter'. If no type specified 'clip' will be used ('full' used if clipping disabled). Multiple types can be provided separated by a space. [--gfa clip assumed by default]") - parser.add_argument("--rgfa", nargs='*', default=None, help = "Produce a rGFA cover for given graph type(s) if specified. The rGFA cover will be output as a separate .rgfa.gz file, as well as included in the .gbz is reference path segements. This cover will be based on the (first) reference sample only. Valid types are 'full', 'clip', and 'filter'. If no type specified 'clip' will be used ('full' used if clipping disabled). Multiple types can be provided separated by a space. [--gfa clip assumed by default]") + parser.add_argument("--rgfa", nargs='*', default=None, help = "Produce a rGFA cover for given graph type(s) if specified. The rGFA cover will be output as a separate .rgfa.gz file, as well as included in the .gbz is reference path segements. This cover will be based on the (first) reference sample only. Valid types are 'full', 'clip', and 'filter'. If no type specified 'clip' will be used ('full' used if clipping disabled). Multiple types can be provided separated by a space.") parser.add_argument("--gbz", nargs='*', default=None, help = "Generate GBZ/snarls indexes for the given graph type(s) if specified. Valid types are 'full', 'clip' and 'filter'. If no type specified 'clip' will be used ('full' used if clipping disabled). Multiple types can be provided separated by a space. --giraffe will also produce these (and other) indexes") @@ -176,9 +178,9 @@ def graphmap_join_validate_options(options): if gfa == 'filter' and not options.filter: raise RuntimeError('--gfa cannot be set to filter since filtering is disabled') - if not options.rgfa: + if options.rgfa == []: options.rgfa = ['clip'] if options.clip else ['full'] - options.rgfa = list(set(options.rgfa)) + options.rgfa = list(set(options.rgfa)) if options.rgfa else [] for rgfa in options.rgfa: if rgfa not in ['clip', 'filter', 'full']: raise RuntimeError('Unrecognized value for --gfa: {}. Must be one of {clip, filter, full}'.format(gfa)) @@ -383,18 +385,29 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): full_vg_ids = [join_job.rv(i) for i in range(len(vg_ids))] prev_job = join_job + # by default, we don't leave rGFA path fragments in the chromosome graphs, they will only be in the .gbz and .rgfa.gz output + drop_rgfa_fragments = options.rgfa and not getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "keepRGFAFragmentsInChromGraphs", typeFn=bool, default=False) + # take out the _MINIGRAPH_ paths if 'full' in options.chrom_vg: output_full_vg_ids = [] for vg_path, vg_id, full_vg_id in zip(options.vg, vg_ids, full_vg_ids): drop_graph_event_job = join_job.addFollowOnJobFn(drop_graph_event, config, vg_path, full_vg_id, disk=vg_id.size * 3, memory=vg_id.size * 6) - output_full_vg_ids.append(drop_graph_event_job.rv()) + if drop_rgfa_fragments: + # take out the rGFA path fragments + drop_rgfa_job = drop_graph_event_job.addFollowOnJobFn(drop_rgfa_path_fragments, vg_path, drop_graph_event_job.rv(), + disk=vg_id.size * 3, memory=vg_id.size & 6) + output_full_vg_ids.append(drop_rgfa_job.rv()) + else: + output_full_vg_ids.append(drop_graph_event_job.rv()) else: output_full_vg_ids = full_vg_ids + # run the "clip" phase to do the clip-vg clipping clip_vg_ids = [] + output_clip_vg_ids = [] clipped_stats = None if options.clip or options.filter: clip_root_job = Job() @@ -411,6 +424,11 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): clip_og_job = clip_job.addFollowOnJobFn(vg_to_og, options, config, vg_path, clip_job.rv(0), disk=input_vg_id.size * 16, memory=max(og_min_size, input_vg_id.size * 32)) og_chrom_ids['clip']['og'].append(clip_og_job.rv()) + if drop_rgfa_fragments: + drop_rgfa_job = clip_job.addFollowOnJobFn(drop_rgfa_path_fragments, vg_path, clip_job.rv(0), + disk=input_vg_id.size * 3, memory=input_vg_id.size & 6) + output_clip_vg_ids.append(drop_rgfa_job.rv()) + # join the stats clipped_stats = clip_root_job.addFollowOnJobFn(cat_stats, clip_vg_stats).rv() @@ -418,6 +436,7 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): # run the "filter" phase to do the vg clip clipping filter_vg_ids = [] + output_filter_vg_ids = [] if options.filter: filter_root_job = Job() prev_job.addFollowOn(filter_root_job) @@ -429,8 +448,11 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): if 'filter' in options.odgi + options.chrom_og + options.viz + options.draw: filter_og_job = filter_job.addFollowOnJobFn(vg_to_og, options, config, vg_path, filter_job.rv(), disk=input_vg_id.size * 16, memory=max(og_min_size, input_vg_id.size * 64)) - og_chrom_ids['filter']['og'].append(filter_og_job.rv()) - + og_chrom_ids['filter']['og'].append(filter_og_job.rv()) + if drop_rgfa_fragments: + drop_rgfa_job = filter_job.addFollowOnJobFn(drop_rgfa_path_fragments, vg_path, filter_job.rv(), + disk=input_vg_id.size * 3, memory=input_vg_id.size & 6) + output_filter_vg_ids.append(drop_rgfa_job.rv()) prev_job = filter_root_job @@ -475,6 +497,7 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): rgfa_ids = [] current_out_dict = None do_gbz = workflow_phase in options.gbz + options.vcf + options.giraffe + do_rgfa = workflow_phase in options.rgfa if do_gbz or workflow_phase in options.gfa: assert len(options.vg) == len(phase_vg_ids) == len(vg_ids) for vg_path, vg_id, input_vg_id in zip(options.vg, phase_vg_ids, vg_ids): @@ -485,9 +508,9 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): gfa_merge_job = gfa_root_job.addFollowOnJobFn(make_vg_indexes, options, config, gfa_ids, tag=workflow_phase + '.', - do_gbz=do_gbz, + do_gbz=do_gbz, do_rgfa=do_rgfa, cores=options.indexCores, - disk=sum(f.size for f in vg_ids) * 6, + disk=sum(f.size for f in vg_ids) * 16, memory=index_mem) out_dicts.append(gfa_merge_job.rv()) prev_job = gfa_merge_job @@ -519,6 +542,16 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): memory=index_mem) out_dicts.append(deconstruct_job.rv()) + if do_rgfa and vcf_ref == options.vcfReference[0]: + rgfa_deconstruct_job = prev_job.addFollowOnJobFn(make_vcf, config, options.outName, vcf_ref, current_out_dict, + max_ref_allele=options.vcfbub, + tag=vcftag + '.', ref_tag = workflow_phase + '.', + do_rgfa=True, + cores=options.indexCores, + disk = sum(f.size for f in vg_ids) * 6, + memory=index_mem) + out_dicts.append(rgfa_deconstruct_job.rv()) + # optional giraffe if workflow_phase in options.giraffe: giraffe_job = gfa_merge_job.addFollowOnJobFn(make_giraffe_indexes, options, config, current_out_dict, @@ -555,7 +588,7 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): if do_draw: og_chrom_ids[workflow_phase]['draw'].append(viz_job.rv(1) if viz_job else None) - return output_full_vg_ids, clip_vg_ids, clipped_stats, filter_vg_ids, out_dicts, og_chrom_ids + return output_full_vg_ids, output_clip_vg_ids, clipped_stats, output_filter_vg_ids, out_dicts, og_chrom_ids def clip_vg(job, options, config, vg_path, vg_id, phase): """ run clip-vg @@ -755,6 +788,16 @@ def drop_graph_event(job, config, vg_path, full_vg_id): cactus_call(parameters=['vg', 'paths', '-d', '-S', graph_event, '-x', full_vg_path], outfile=out_path) return job.fileStore.writeGlobalFile(out_path) + +def drop_rgfa_path_fragments(job, vg_path, vg_id): + """ drop out rgfa path fragments """ + work_dir = job.fileStore.getLocalTempDir() + vg_path = os.path.join(work_dir, os.path.splitext(os.path.basename(vg_path))[0]) + '.vg' + out_vg_path = os.path.join(work_dir, os.path.splitext(os.path.basename(vg_path))[0]) + 'norgfa.vg' + job.fileStore.readGlobalFile(vg_id, vg_path) + + cactus_call(parameters=['vg', 'paths', '-d', '-S', rgfa_sample_name, '-v', vg_path], outfile=out_vg_path) + return job.fileStore.writeGlobalFile(out_vg_path) def vg_to_gfa(job, options, config, vg_path, vg_id, rgfa=False): """ run gfa conversion """ @@ -809,7 +852,7 @@ def merge_rgfa(job, options, config, rgfa_ids, tag=''): return { '{}rgfa.gz'.format(tag) : job.fileStore.writeGlobalFile(merge_rgfa_path + '.gz') } -def make_vg_indexes(job, options, config, gfa_ids, tag="", do_gbz=False): +def make_vg_indexes(job, options, config, gfa_ids, tag="", do_gbz=False, do_rgfa=False): """ merge of the gfas, then make gbz / snarls / trans """ work_dir = job.fileStore.getLocalTempDir() @@ -825,8 +868,8 @@ def make_vg_indexes(job, options, config, gfa_ids, tag="", do_gbz=False): job.fileStore.readGlobalFile(gfa_id, gfa_path, mutable=True) if i == 0: # make sure every --reference sample is in the GFA header RS tag (which won't be the case if one sample - # is completely missing from the first graph, as vg may filter it out apparently) - cmd = [['head', '-1', gfa_path], ['sed', '-e', '1s/{}//'.format(graph_event)]] + # is completely missing from the first graph, as vg may filter it out apparently) + cmd = [['head', '-1', gfa_path], ['sed', '-e', '1s/ {}//'.format(graph_event), '-e', '1s/{} //'.format(graph_event)]] gfa_header = cactus_call(parameters=cmd, check_output=True).strip().split('\t') for i in range(len(gfa_header)): if gfa_header[i].startswith('RS:Z:'): @@ -841,44 +884,59 @@ def make_vg_indexes(job, options, config, gfa_ids, tag="", do_gbz=False): cactus_call(parameters=cmd, outfile=merge_gfa_path, outappend=True, job_memory=job.memory) job.fileStore.deleteGlobalFile(gfa_id) - # make the gbz + out_dict = {} + + if do_rgfa: + if do_gbz: + # make a GBZ that includes the rGFA fragments + rgfa_gbz_path = os.path.join(work_dir, '{}merged.rgfa.gbz'.format(tag)) + cactus_call(parameters=['vg', 'gbwt', '-G', merge_gfa_path, '--gbz-format', '-g', rgfa_gbz_path], job_memory=job.memory) + out_dict['{}rgfa.gbz'.format(tag)] = job.fileStore.writeGlobalFile(rgfa_gbz_path) + + # remove rGFA paths from the GFA going forward + cactus_call(parameters=[['grep', '-v', '^W\t{}'.format(rgfa_sample_name), merge_gfa_path], + ['sed', '-e', '1s/ {}//g'.format(rgfa_sample_name), '-e', '1s/{} //g'.format(rgfa_sample_name)]], + outfile=merge_gfa_path + '.clean') + cactus_call(parameters=['mv', merge_gfa_path +'.clean', merge_gfa_path]) + + # make the gbz without the rGFA fragments if do_gbz: gbz_path = os.path.join(work_dir, '{}merged.gbz'.format(tag)) cactus_call(parameters=['vg', 'gbwt', '-G', merge_gfa_path, '--gbz-format', '-g', gbz_path], job_memory=job.memory) - - # zip the gfa and remove any rgfa paths (they will be kept in the GBZ and .rgfa - # but not here as it would be just too confusing) - rgfa_sample_name = '_rGFA_' - gfa_path = merge_gfa_path + '.gz' - cactus_call(parameters=[['grep', '-v', '^W\t{}'.format(rgfa_sample_name), merge_gfa_path], - ['bgzip', '--threads', str(job.cores)]], - outfile=gfa_path) - os.remove(merge_gfa_path) - + out_dict['{}gbz'.format(tag)] = job.fileStore.writeGlobalFile(gbz_path) + + # zip the gfa + cactus_call(parameters=['bgzip', '--threads', str(job.cores), merge_gfa_path]) + out_dict['{}gfa.gz'.format(tag)] = job.fileStore.writeGlobalFile(merge_gfa_path + '.gz') # make the snarls if do_gbz: snarls_path = os.path.join(work_dir, '{}merged.snarls'.format(tag)) cactus_call(parameters=['vg', 'snarls', gbz_path, '-T', '-t', str(job.cores)], outfile=snarls_path, job_memory=job.memory) + out_dict['{}snarls'.format(tag)] = job.fileStore.writeGlobalFile(snarls_path) - out_dict = { '{}gfa.gz'.format(tag) : job.fileStore.writeGlobalFile(gfa_path) } if do_gbz: out_dict['{}gbz'.format(tag)] = job.fileStore.writeGlobalFile(gbz_path) - out_dict['{}snarls'.format(tag)] = job.fileStore.writeGlobalFile(snarls_path) + return out_dict -def make_vcf(job, config, out_name, vcf_ref, index_dict, tag='', ref_tag='', max_ref_allele=None): +def make_vcf(job, config, out_name, vcf_ref, index_dict, tag='', ref_tag='', max_ref_allele=None, do_rgfa=False): """ make the vcf """ work_dir = job.fileStore.getLocalTempDir() gbz_path = os.path.join(work_dir, ref_tag + os.path.basename(out_name) + '.gbz') snarls_path = os.path.join(work_dir, ref_tag + os.path.basename(out_name) + '.snarls') - job.fileStore.readGlobalFile(index_dict['{}gbz'.format(ref_tag)], gbz_path) + job.fileStore.readGlobalFile(index_dict['{}{}gbz'.format(ref_tag, 'rgfa.' if do_rgfa else '')], gbz_path) job.fileStore.readGlobalFile(index_dict['{}snarls'.format(ref_tag)], snarls_path) + if do_rgfa: + tag = 'rgfa.{}'.format(tag) + # make the vcf vcf_path = os.path.join(work_dir, '{}merged.vcf.gz'.format(tag)) decon_cmd = ['vg', 'deconstruct', gbz_path, '-P', vcf_ref, '-C', '-a', '-r', snarls_path, '-t', str(job.cores)] + if do_rgfa: + decon_cmd += ['-P', rgfa_sample_name] if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "GFANodeIDsInVCF", typeFn=bool, default=True): decon_cmd.append('-O') cactus_call(parameters=[decon_cmd, ['bgzip', '--threads', str(job.cores)]], outfile=vcf_path, job_memory=job.memory) @@ -890,12 +948,15 @@ def make_vcf(job, config, out_name, vcf_ref, index_dict, tag='', ref_tag='', max RealtimeLogger.warning('WARNING: tabix failed on VCF with this error {}'.format(str(e))) tbi_path = None - output_dict = { '{}raw.vcf.gz'.format(tag) : job.fileStore.writeGlobalFile(vcf_path) } + # dont do vcfbub when doing rGFA + out_tag = '{}raw.'.format(tag) if not do_rgfa else tag + + output_dict = { '{}vcf.gz'.format(out_tag) : job.fileStore.writeGlobalFile(vcf_path) } if tbi_path: - output_dict['{}raw.vcf.gz.tbi'.format(tag)] = job.fileStore.writeGlobalFile(tbi_path) + output_dict['{}vcf.gz.tbi'.format(out_tag)] = job.fileStore.writeGlobalFile(tbi_path) # make the filtered vcf - if max_ref_allele: + if max_ref_allele and not do_rgfa: vcfbub_path = os.path.join(work_dir, 'merged.filtered.vcf.gz') cactus_call(parameters=[['vcfbub', '--input', vcf_path, '--max-ref-length', str(max_ref_allele), '--max-level', '0'], ['bgzip', '--threads', str(job.cores)]], diff --git a/test/evolverTest.py b/test/evolverTest.py index c17946580..631345169 100644 --- a/test/evolverTest.py +++ b/test/evolverTest.py @@ -468,7 +468,7 @@ def _run_yeast_pangenome(self, binariesMode): cactus_pangenome_cmd = ['cactus-pangenome', self._job_store(binariesMode), orig_seq_file_path, '--outDir', join_path, '--outName', 'yeast', '--refContigs'] + chroms + ['--reference', 'S288C', 'DBVPG6044', '--vcf', '--vcfReference','DBVPG6044', 'S288C', '--giraffe', 'clip', 'filter', '--chrom-vg', 'clip', 'filter', - '--viz', '--chrom-og', 'clip', 'full', '--odgi', '--haplo', 'clip', + '--viz', '--chrom-og', 'clip', 'full', '--odgi', '--haplo', 'clip', '--rgfa', '--indexCores', '4', '--consCores', '2'] subprocess.check_call(cactus_pangenome_cmd + cactus_opts) @@ -476,7 +476,7 @@ def _run_yeast_pangenome(self, binariesMode): subprocess.check_call(['mkdir', '-p', os.path.join(self.tempDir, 'chroms')]) subprocess.check_call(['mv', os.path.join(join_path, 'chrom-subproblems', 'contig_sizes.tsv'), os.path.join(self.tempDir, 'chroms')]) - def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False, expect_haplo=False): + def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False, expect_haplo=False, expect_rgfa=False): """ yeast pangenome chromosome by chromosome pipeline """ @@ -541,6 +541,23 @@ def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False for haplo_idx in ['yeast.ri', 'yeast.hapl']: idx_bytes = os.path.getsize(os.path.join(join_path, haplo_idx)) self.assertGreaterEqual(idx_bytes, 10000000) + + if expect_rgfa: + # make sure we have the rgfa-related files + for rgfa_idx in ['yeast.rgfa.gz', 'yeast.rgfa.gbz', 'yeast.rgfa.vcf.gz', 'yeast.rgfa.vcf.gz.tbi']: + idx_bytes = os.path.getsize(os.path.join(join_path, haplo_idx)) + if not rgfa_idx.endswith('.tbi'): + self.assertGreaterEqual(idx_bytes, 500000) + + # make sure we have some off-ref nodes in the rgfa file + for rank, expected_count in [(0, 200000), (1, 5000), (2, 100), (3, 9), (4, 2), (5, 1)]: + rank_count = int(subprocess.check_output('gzip -dc {} | grep "SR:i:{}" | wc -l'.format(os.path.join(join_path, 'yeast.rgfa.gz'),rank), shell=True).decode('utf-8')) + self.assertGreaterEqual(rank_count, expected_count) + + # make sure the vcf is bigger + vcf_lines = int(subprocess.check_output('gzip -dc {} | wc -l'.format(vcf_paths[0]), shell=True).decode('utf-8')) + rgfa_vcf_lines = int(subprocess.check_output('gzip -dc {} | wc -l'.format(vcf_paths[0].replace('.vcf','.rgfa.vcf')), shell=True).decode('utf-8')) + self.assertGreaterEqual(rgfa_vcf_lines - vcf_lines, 1000) # make sure the chrom splitting stats are somewhat sane contig_sizes = {} @@ -572,6 +589,9 @@ def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False clip_nodes = int(subprocess.check_output(['vg', 'stats', '-N', os.path.join(join_path, 'yeast.gbz')]).strip().decode('utf-8').strip()) self.assertGreaterEqual(clip_nodes, 400000) self.assertLessEqual(clip_nodes, 500000) + if expect_rgfa: + rgfa_clip_nodes = int(subprocess.check_output(['vg', 'stats', '-N', os.path.join(join_path, 'yeast.rgfa.gbz')]).strip().decode('utf-8').strip()) + self.assertEqual(rgfa_clip_nodes, clip_nodes) filter_nodes = int(subprocess.check_output(['vg', 'stats', '-N', os.path.join(join_path, 'yeast.d2.gbz')]).strip().decode('utf-8').strip()) self.assertGreaterEqual(filter_nodes, 300000) @@ -580,7 +600,10 @@ def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False clip_edges = int(subprocess.check_output(['vg', 'stats', '-E', os.path.join(join_path, 'yeast.gbz')]).strip().decode('utf-8').strip()) self.assertGreaterEqual(clip_edges, 550000) self.assertLessEqual(clip_edges, 650000) - + if expect_rgfa: + rgfa_clip_edges = int(subprocess.check_output(['vg', 'stats', '-E', os.path.join(join_path, 'yeast.rgfa.gbz')]).strip().decode('utf-8').strip()) + self.assertEqual(rgfa_clip_edges, clip_edges) + filter_edges = int(subprocess.check_output(['vg', 'stats', '-E', os.path.join(join_path, 'yeast.d2.gbz')]).strip().decode('utf-8').strip()) self.assertGreaterEqual(filter_edges, 400000) self.assertLessEqual(filter_edges, 500000) @@ -946,7 +969,7 @@ def testYeastPangenomeLocal(self): self._run_yeast_pangenome(name) # check the output - self._check_yeast_pangenome(name, other_ref='DBVPG6044', expect_odgi=True, expect_haplo=True) + self._check_yeast_pangenome(name, other_ref='DBVPG6044', expect_odgi=True, expect_haplo=True, expect_rgfa=True) if __name__ == '__main__': From 018f6180446f86b6ff1f4d65254ef7c05981bb8a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 5 Oct 2023 16:21:19 -0400 Subject: [PATCH 3/7] update to newer vg --- build-tools/downloadPangenomeTools | 2 +- src/cactus/refmap/cactus_graphmap_join.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index dbe5907a0..092bf9798 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -279,7 +279,7 @@ fi # vg cd ${pangenomeBuildDir} #wget -q https://github.com/vgteam/vg/releases/download/v1.51.0/vg -wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.30ac0c77e91fbeff06b6e7bc9a7f172607ff84ba -O vg +wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.aadda02a8056c8120c5fa655b04fbe927f3b8586 -O vg chmod +x vg if [[ $STATIC_CHECK -ne 1 || $(ldd vg | grep so | wc -l) -eq 0 ]] then diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index 220d36246..a42724ce3 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -663,7 +663,7 @@ def clip_vg(job, options, config, vg_path, vg_id, phase): if phase in options.rgfa: # do the rgfa cover min_rgfa_fragment = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "minRGFAFragment", typeFn=int, default=1) - rgfa_cover_cmd = ['vg', 'paths', '-v', '-', '-R', str(min_rgfa_fragment), + rgfa_cover_cmd = ['vg', 'paths', '-v', '-', '-f', str(min_rgfa_fragment), '-Q', options.reference[0], '-t', str(job.cores)] cmd.append(rgfa_cover_cmd) @@ -796,7 +796,7 @@ def drop_rgfa_path_fragments(job, vg_path, vg_id): out_vg_path = os.path.join(work_dir, os.path.splitext(os.path.basename(vg_path))[0]) + 'norgfa.vg' job.fileStore.readGlobalFile(vg_id, vg_path) - cactus_call(parameters=['vg', 'paths', '-d', '-S', rgfa_sample_name, '-v', vg_path], outfile=out_vg_path) + cactus_call(parameters=['vg', 'paths', '-d', '--rgfa-paths', '-v', vg_path], outfile=out_vg_path) return job.fileStore.writeGlobalFile(out_vg_path) def vg_to_gfa(job, options, config, vg_path, vg_id, rgfa=False): From 1c39998f5abebea60a5a1bf71f255d675f3d008e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 10:47:19 -0400 Subject: [PATCH 4/7] update vg --- build-tools/downloadPangenomeTools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 092bf9798..6be297a21 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -279,7 +279,7 @@ fi # vg cd ${pangenomeBuildDir} #wget -q https://github.com/vgteam/vg/releases/download/v1.51.0/vg -wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.aadda02a8056c8120c5fa655b04fbe927f3b8586 -O vg +wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.e65bfb2f9ee098c2bd65a854d7d8597048e7c529 -O vg chmod +x vg if [[ $STATIC_CHECK -ne 1 || $(ldd vg | grep so | wc -l) -eq 0 ]] then From ae6985b989aaefef0e80e362fd40922cf709eccf Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 10:56:05 -0400 Subject: [PATCH 5/7] update vg again --- build-tools/downloadPangenomeTools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 6be297a21..d7885bf00 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -279,7 +279,7 @@ fi # vg cd ${pangenomeBuildDir} #wget -q https://github.com/vgteam/vg/releases/download/v1.51.0/vg -wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.e65bfb2f9ee098c2bd65a854d7d8597048e7c529 -O vg +wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.c162ee345210d18e5eb6f3a129da857184e952be -O vg chmod +x vg if [[ $STATIC_CHECK -ne 1 || $(ldd vg | grep so | wc -l) -eq 0 ]] then From 55e27707ecec0935d7d5e15a61d7acd3b1dd61e7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 16:06:29 -0400 Subject: [PATCH 6/7] 0-haplotypes everywhere --- build-tools/downloadPangenomeTools | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index d7885bf00..4491ba113 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -215,7 +215,7 @@ fi # hal2vg cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.5/hal2vg +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.4/hal2vg chmod +x hal2vg if [[ $STATIC_CHECK -ne 1 || $(ldd hal2vg | grep so | wc -l) -eq 0 ]] then @@ -225,7 +225,7 @@ else fi # clip-vg cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.5/clip-vg +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.4/clip-vg chmod +x clip-vg if [[ $STATIC_CHECK -ne 1 || $(ldd clip-vg | grep so | wc -l) -eq 0 ]] then @@ -235,7 +235,7 @@ else fi # halRemoveDupes cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.5/halRemoveDupes +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.4/halRemoveDupes chmod +x halRemoveDupes if [[ $STATIC_CHECK -ne 1 || $(ldd halRemoveDupes | grep so | wc -l) -eq 0 ]] then @@ -245,7 +245,7 @@ else fi # halMergeChroms cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.5/halMergeChroms +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.4/halMergeChroms chmod +x halMergeChroms if [[ $STATIC_CHECK -ne 1 || $(ldd halMergeChroms | grep so | wc -l) -eq 0 ]] then @@ -256,7 +256,7 @@ fi # halUnclip cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.5/halUnclip +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.4/halUnclip chmod +x halUnclip if [[ $STATIC_CHECK -ne 1 || $(ldd halUnclip | grep so | wc -l) -eq 0 ]] then @@ -267,7 +267,7 @@ fi # filter-paf-deletions cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.5/filter-paf-deletions +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.4/filter-paf-deletions chmod +x filter-paf-deletions if [[ $STATIC_CHECK -ne 1 || $(ldd filter-paf-deletions | grep so | wc -l) -eq 0 ]] then @@ -279,7 +279,7 @@ fi # vg cd ${pangenomeBuildDir} #wget -q https://github.com/vgteam/vg/releases/download/v1.51.0/vg -wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.c162ee345210d18e5eb6f3a129da857184e952be -O vg +wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.9df2a056197cafd817cf48c76cf662dd775d265d -O vg chmod +x vg if [[ $STATIC_CHECK -ne 1 || $(ldd vg | grep so | wc -l) -eq 0 ]] then From 1ba988f83d4e373fb9c8619966f7edbb70d45c2c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Nov 2023 16:23:14 -0500 Subject: [PATCH 7/7] fix rgfa/vcf option toggle--rgfa only works on backbone --- src/cactus/refmap/cactus_graphmap_join.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index 8a6b4e78d..ce91e5582 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -542,7 +542,7 @@ def graphmap_join_workflow(job, options, config, vg_ids, hal_ids): memory=index_mem) out_dicts.append(deconstruct_job.rv()) - if do_rgfa and vcf_ref == options.vcfReference[0]: + if do_rgfa and vcf_ref == options.reference[0]: rgfa_deconstruct_job = prev_job.addFollowOnJobFn(make_vcf, config, options.outName, vcf_ref, current_out_dict, max_ref_allele=options.vcfbub, tag=vcftag + '.', ref_tag = workflow_phase + '.',