Skip to content

Commit

Permalink
Import upstream htslib/samtools/bcftools 1.21
Browse files Browse the repository at this point in the history
For each package:

rm -rf htslib;   python3 devtools/import.py htslib   .../htslib-1.21
rm -rf samtools; python3 devtools/import.py samtools .../samtools-1.21
rm -rf bcftools; python3 devtools/import.py bcftools .../bcftools-1.21

Take care to preserve the #define additions to bcftools/regidx.h.
(Don't bother adding {sam,bcf}tools/htslib-1.21/{LICENSE,**/README}.)

Also take care to preserve removing the regeneration of htscodecs.mk
from htslib/Makefile.
  • Loading branch information
jmarshall committed Oct 6, 2024
1 parent b1e87ef commit d3d2733
Show file tree
Hide file tree
Showing 225 changed files with 30,089 additions and 9,982 deletions.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured
compilation options. Especially for OS X this will potentially save a
lot of trouble.

The current version of pysam wraps 3rd-party code from htslib-1.18, samtools-1.18, and bcftools-1.18.
The current version of pysam wraps 3rd-party code from htslib-1.21, samtools-1.21, and bcftools-1.21.

Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
Expand Down
2 changes: 1 addition & 1 deletion bcftools/HMM.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ double *hmm_get_fwd_bwd_prob(hmm_t *hmm);
* @sites: list of positions
*
* Same as hmm_run_fwd_bwd, in addition a pointer to a matrix with the new
* transition probabilities is returned. In this verison, emission
* transition probabilities is returned. In this version, emission
* probabilities are not updated.
*/
double *hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
Expand Down
27 changes: 26 additions & 1 deletion bcftools/LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ the INSTALL document), the use of this software is governed by the GPL license.

The MIT/Expat License

Copyright (C) 2012-2023 Genome Research Ltd.
Copyright (C) 2012-2024 Genome Research Ltd.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -772,3 +772,28 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-----------------------------------------------------------------------------

License for edlib.[ch]

The MIT License (MIT)

Copyright (c) 2014 Martin Šošić

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
66 changes: 56 additions & 10 deletions bcftools/abuf.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2021-2023 Genome Research Ltd.
Copyright (c) 2021-2024 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -43,6 +43,7 @@ typedef struct
kstring_t ref, alt;
int ial; // the index of the original ALT allele, 1-based
int beg, end; // 0-based inclusive offsets to ref,alt
int plen; // the ref,alt prefix length, eg plen=1 for C>CA
}
atom_t;

Expand Down Expand Up @@ -175,8 +176,9 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
atom->alt.l = 0;
kputc(refb, &atom->ref);
kputc(refb, &atom->alt);
atom->beg = atom->end = i;
atom->ial = ial;
atom->beg = atom->end = i;
atom->ial = ial;
atom->plen = 1;
}
continue;
}
Expand All @@ -202,6 +204,35 @@ static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
if ( rcmp ) return rcmp;
return strcasecmp(a->alt.s,b->alt.s);
}

// returns
// 0 .. identical beg,ref,alt
// 1 .. non-overlapping variants, but record may overlap (A>AT vs A>C)
// 2 .. overlapping (conflicting) variants
static int _atoms_overlap(const atom_t *a, const atom_t *b)
{
if ( a->beg < b->beg ) return 2;
if ( a->beg > b->beg ) return 2;

// consider SNV followed by DEL as not overlapping
// CC > C a.plen=1 (ref,alt prefix len=1)
// C > T b.plen=0 (ref,alt prefix len=0)
if ( a->plen && a->plen >= b->ref.l ) return 1;
if ( b->plen && b->plen >= a->ref.l ) return 1;

int rcmp = strcasecmp(a->ref.s,b->ref.s);
if ( rcmp ) return 2;

// consider SNV followed by INS as not overlapping
// A > AT a.plen=1 (ref,alt prefix len=1)
// A > C b.plen=0 (ref,alt prefix len=0)
if ( a->plen && a->plen >= b->alt.l ) return 1;
if ( b->plen && b->plen >= a->alt.l ) return 1;

rcmp = strcasecmp(a->alt.s,b->alt.s);
if ( rcmp ) return 2;
return 0;
}
/*
For reproducibility of tests on different platforms, we need to guarantee the same order of identical
atoms originating from different source ALTs. Even though they are consistent, different values can be
Expand Down Expand Up @@ -238,7 +269,14 @@ static void _split_table_new(abuf_t *buf, atom_t *atom)
static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
{
uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1;
int olap = _atoms_overlap(atom,buf->split.atoms[iout]);
ptr[atom->ial-1] = olap > 1 ? 2 : 1;

// The test test/atomize.split.5.vcf shows why we sometimes can and sometimes
// cannot remove the star allele like this
// buf->split.overlaps[iout] = olap > 1 ? 1 : 0;
// I forgot the details of the code, so don't immediately see
// if this could be made smarter
buf->split.overlaps[iout] = 1;
}
#if 0
Expand Down Expand Up @@ -411,13 +449,21 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo
buf->tmp2 = dst.s;
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
}
if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
if ( ret!=0 ) error("An error occurred while updating INFO/%s (errcode=%d)\n",tag,ret);
}
}
static void _split_table_set_history(abuf_t *buf)
{
int i,j;
int i,j,ret;
bcf1_t *rec = buf->split.rec;

// Don't update if the tag already exists. This is to prevent -a from overwriting -m
int m = 0;
char *tmp = NULL;
ret = bcf_get_info_string(buf->hdr,rec,buf->split.info_tag,&tmp,&m);
free(tmp);
if ( ret>0 ) return;

buf->tmps.l = 0;
ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
for (i=1; i<rec->n_allele; i++)
Expand All @@ -441,8 +487,8 @@ static void _split_table_set_history(abuf_t *buf)
kputc(',',&buf->tmps);
}
buf->tmps.s[--buf->tmps.l] = 0;
if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
if ( (ret=bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
error("An error occurred while updating INFO/%s (errcode=%d)\n",buf->split.info_tag,ret);
}
}
static void _split_table_set_gt(abuf_t *buf)
Expand Down Expand Up @@ -668,7 +714,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo
#undef BRANCH
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
}
if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
if ( ret!=0 ) error("An error occurred while updating FORMAT/%s (errcode=%d)\n",tag,ret);
}
}
static inline int _is_acgtn(char *seq)
Expand Down Expand Up @@ -737,7 +783,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec)
_split_table_init(buf,rec,buf->natoms);
for (i=0; i<buf->natoms; i++)
{
if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
if ( i && _atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i])==0 ) continue;
_split_table_new(buf, &buf->atoms[i]); // add a new unique output atom
}
for (i=0; i<buf->natoms; i++)
Expand Down
66 changes: 56 additions & 10 deletions bcftools/abuf.c.pysam.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

/* The MIT License
Copyright (c) 2021-2023 Genome Research Ltd.
Copyright (c) 2021-2024 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -45,6 +45,7 @@ typedef struct
kstring_t ref, alt;
int ial; // the index of the original ALT allele, 1-based
int beg, end; // 0-based inclusive offsets to ref,alt
int plen; // the ref,alt prefix length, eg plen=1 for C>CA
}
atom_t;

Expand Down Expand Up @@ -177,8 +178,9 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
atom->alt.l = 0;
kputc(refb, &atom->ref);
kputc(refb, &atom->alt);
atom->beg = atom->end = i;
atom->ial = ial;
atom->beg = atom->end = i;
atom->ial = ial;
atom->plen = 1;
}
continue;
}
Expand All @@ -204,6 +206,35 @@ static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
if ( rcmp ) return rcmp;
return strcasecmp(a->alt.s,b->alt.s);
}

// returns
// 0 .. identical beg,ref,alt
// 1 .. non-overlapping variants, but record may overlap (A>AT vs A>C)
// 2 .. overlapping (conflicting) variants
static int _atoms_overlap(const atom_t *a, const atom_t *b)
{
if ( a->beg < b->beg ) return 2;
if ( a->beg > b->beg ) return 2;

// consider SNV followed by DEL as not overlapping
// CC > C a.plen=1 (ref,alt prefix len=1)
// C > T b.plen=0 (ref,alt prefix len=0)
if ( a->plen && a->plen >= b->ref.l ) return 1;
if ( b->plen && b->plen >= a->ref.l ) return 1;

int rcmp = strcasecmp(a->ref.s,b->ref.s);
if ( rcmp ) return 2;

// consider SNV followed by INS as not overlapping
// A > AT a.plen=1 (ref,alt prefix len=1)
// A > C b.plen=0 (ref,alt prefix len=0)
if ( a->plen && a->plen >= b->alt.l ) return 1;
if ( b->plen && b->plen >= a->alt.l ) return 1;

rcmp = strcasecmp(a->alt.s,b->alt.s);
if ( rcmp ) return 2;
return 0;
}
/*
For reproducibility of tests on different platforms, we need to guarantee the same order of identical
atoms originating from different source ALTs. Even though they are consistent, different values can be
Expand Down Expand Up @@ -240,7 +271,14 @@ static void _split_table_new(abuf_t *buf, atom_t *atom)
static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
{
uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1;
int olap = _atoms_overlap(atom,buf->split.atoms[iout]);
ptr[atom->ial-1] = olap > 1 ? 2 : 1;

// The test test/atomize.split.5.vcf shows why we sometimes can and sometimes
// cannot remove the star allele like this
// buf->split.overlaps[iout] = olap > 1 ? 1 : 0;
// I forgot the details of the code, so don't immediately see
// if this could be made smarter
buf->split.overlaps[iout] = 1;
}
#if 0
Expand Down Expand Up @@ -413,13 +451,21 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo
buf->tmp2 = dst.s;
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
}
if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
if ( ret!=0 ) error("An error occurred while updating INFO/%s (errcode=%d)\n",tag,ret);
}
}
static void _split_table_set_history(abuf_t *buf)
{
int i,j;
int i,j,ret;
bcf1_t *rec = buf->split.rec;

// Don't update if the tag already exists. This is to prevent -a from overwriting -m
int m = 0;
char *tmp = NULL;
ret = bcf_get_info_string(buf->hdr,rec,buf->split.info_tag,&tmp,&m);
free(tmp);
if ( ret>0 ) return;

buf->tmps.l = 0;
ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
for (i=1; i<rec->n_allele; i++)
Expand All @@ -443,8 +489,8 @@ static void _split_table_set_history(abuf_t *buf)
kputc(',',&buf->tmps);
}
buf->tmps.s[--buf->tmps.l] = 0;
if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
if ( (ret=bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
error("An error occurred while updating INFO/%s (errcode=%d)\n",buf->split.info_tag,ret);
}
}
static void _split_table_set_gt(abuf_t *buf)
Expand Down Expand Up @@ -670,7 +716,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo
#undef BRANCH
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
}
if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
if ( ret!=0 ) error("An error occurred while updating FORMAT/%s (errcode=%d)\n",tag,ret);
}
}
static inline int _is_acgtn(char *seq)
Expand Down Expand Up @@ -739,7 +785,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec)
_split_table_init(buf,rec,buf->natoms);
for (i=0; i<buf->natoms; i++)
{
if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
if ( i && _atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i])==0 ) continue;
_split_table_new(buf, &buf->atoms[i]); // add a new unique output atom
}
for (i=0; i<buf->natoms; i++)
Expand Down
Loading

0 comments on commit d3d2733

Please sign in to comment.