filter-high-prob-matches.sh

#!/bin/sh

# TODO:
# Accept 'cutoff' as a command-line parameter.

# Banner text (project name, URL, copyright, license and version number).
# show_help_and_exit prints it ahead of the usage text.
version="
filter-high-prob-matches.sh: part of the Personal Identification Pipeline
https://github.com/TeamErlich/personal-identification-pipeline

Copyright (C) 2016 Yaniv Erlich (yaniv@cs.columbia.edu)
All Rights Reserved.
This program is licensed under GPL version 3.
See LICENSE file for full details.

Version 0.4
"


# Posterior-probability cutoff. An ID is treated as "high probability" when
# its maximum posterior is strictly greater than this value (the awk filter
# further down compares with '>'). The value is also quoted in the help text.
CUTOFF=0.005

# Report a fatal error and terminate the script.
#
# Arguments: $* - the words of the message; they are joined with single
#                 spaces and prefixed with "<script name>: error: ".
# Outputs:   the formatted message, on stderr only.
# Exits:     always, with status 1 (never returns to the caller).
die()
{
    BASE=$(basename "$0")
    # printf instead of echo: under /bin/sh some shells expand backslash
    # escapes in echo arguments, which would garble messages that embed
    # user-supplied file names.
    printf '%s\n' "$BASE: error: $*" >&2
    exit 1
}

# Print the version banner followed by the usage text on stdout, then
# terminate the script with exit status 0.
#
# Globals read: version (banner text), CUTOFF (threshold quoted in the text).
show_help_and_exit()
{
    BASE=$(basename "$0")

    # An unquoted here-document expands $version, $BASE and $CUTOFF the same
    # way a double-quoted string does. The two empty lines in front of the
    # terminator are deliberate: they are part of the emitted text.
    cat <<EOF
$version

This script filters a .matches files and keeps only matches
with high-probability.

Usage: $BASE [OPTIONS] FILE.matches

The .matches file is one generated by the following scripts:
  calc-match-probs.py
  calc-match-probs-parallel.sh

This script will generate the following files:
  FILE.high-prob.max-prob-per-id = list of all samples in the input file,
                   and their maximum posterior probability value.

  FILE.high-prob.high-prob-ids = list of samples which have maximum
                   posterior value > $CUTOFF .

  FILE.high-prob = same format as '.matches', but containing only samples
                   that have posterior > $CUTOFF .

Options:
    -h     = This help screen


EOF

    exit 0
}


##
## Program starts here
##
# Explicit help request: print the usage text and exit successfully.
case "$1" in
    -h|--help) show_help_and_exit ;;
esac

# The only positional argument is the .matches file to filter.
input="$1"
test -z "$input" \
    && die "missing input file (.matches file from pipeline). "\
           "Use -h for help"
test -e "$input" || die "input file '$input' not found"

# Derive the output name: FILE.matches -> FILE.high-prob (an input without
# the '.matches' suffix simply gets '.high-prob' appended).
# Refuse to overwrite the result of an earlier run.
output=${input%.matches}.high-prob
test -e "$output" && die "output file '$output' already exists, aborting."

# Step 1: maximum posterior value per ID.
# --headers makes datamash take the column names from the first input line
# (which is what allows 'ref_id' and 'posterior' to be used by name) and
# emit a header line in its output as well.
# NOTE(review): no --sort is passed, so the rows of each ref_id are assumed
# to be adjacent in the input -- confirm against the .matches producer.
if ! datamash --headers groupby ref_id max posterior \
        < "$input" > "$output.max-prob-per-id"
then
    die "failed to find max-probability per ID"
fi

# Step 2: keep the header line (NR==1) plus every ID whose maximum posterior
# (second column) is strictly greater than $CUTOFF.
if ! awk -v CUTOFF="$CUTOFF" 'NR==1 || $2>CUTOFF' \
        < "$output.max-prob-per-id" > "$output.high-prob-ids"
then
    die "failed to filter high-probability IDs"
fi

# Keep Matches only with these "high probability" IDs.
# The first column of every line of the IDs file is loaded into a lookup
# table; the header line of the input (NR==1) is always passed through and
# any other line is printed only if its first column is in that table.
# Output goes to a temporary '.t' file that is renamed only after awk
# succeeded, because die exits before the mv below is reached.
awk -v IDS="$output.high-prob-ids" -- \
    'BEGIN {
          # getline yields 1 per line read, 0 at end of file and -1 when
          # the file cannot be read. Testing "> 0" ends the loop in both
          # of the latter cases; a bare "while (getline < IDS)" would spin
          # forever on -1.
          while ((status = (getline < IDS)) > 0) {
             ids[$1] = 1
          }
          if (status < 0) {
             # Unreadable IDs file: fail so the caller reports the error.
             exit 1
          }
          close(IDS)
     }
     NR==1 { print ; next }
     $1 in ids { print }' "$input" > "$output.t" \
    || die "failed to filter high-probabilities matches"

mv -- "$output.t" "$output" \
    || die "failed to rename '$output.t' to '$output'"