-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathfilter-high-prob-matches.sh
executable file
·98 lines (73 loc) · 2.39 KB
/
filter-high-prob-matches.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/bin/sh
# TODO:
# Accept 'cutoff' as a command-line parameter.
version="
filter-high-prob-matches.sh: part of the Personal Identification Pipeline
https://github.com/TeamErlich/personal-identification-pipeline
Copyright (C) 2016 Yaniv Erlich ([email protected])
All Rights Reserved.
This program is licensed under GPL version 3.
See LICENSE file for full details.
Version 0.4
"
CUTOFF=0.005
die()
{
BASE=$(basename "$0")
echo "$BASE: error: $*" >&2
exit 1
}
show_help_and_exit()
{
BASE=$(basename "$0")
echo "$version
This script filters a .matches files and keeps only matches
with high-probability.
Usage: $BASE [OPTIONS] FILE.matches
The .matches file is one generated by the following scripts:
calc-match-probs.py
calc-match-probs-parallel.sh
This script will generate the following files:
FILE.high-prob.max-prob-per-id = list of all samples in the input file,
and their maximum posterior probability value.
FILE.high-prob.high-prob-ids = list of samples which have maximum
posterior value > $CUTOFF .
FILE.high-prob = same format as '.matches', but containing only samples
that have posterior > $CUTOFF .
Options:
-h = This help screen
"
exit 0
}
##
## PRogram starts here
##
test "x$1" = "x-h" || test "x$1" = "x--help" && show_help_and_exit
input="$1"
test -z "$1" \
&& die "missing input file (.matches file from pipeline). "\
"Use -h for help"
test -e "$1" || die "input file '$1' not found"
output=${input%.matches}.high-prob
test -e "$output" && die "output file '$output already exists, aborting."
# Find maximum posterior value for each ID
datamash --headers groupby ref_id max posterior < "$input" \
> "$output.max-prob-per-id" \
|| die "failed to find max-probability per ID"
# Filter IDs which reach above P>0.005
awk -v CUTOFF="$CUTOFF" \
'NR==1 || $2>CUTOFF' \
< "$output.max-prob-per-id" > "$output.high-prob-ids" \
|| die "failed to filter high-probability IDs"
# Keep Matches only with these "high probability" IDs
awk -v IDS="$output.high-prob-ids" -- \
'BEGIN {
while (getline < IDS) {
ids[$1] = 1
}
}
NR==1 { print ; next }
$1 in ids { print }' "$input" > "$output.t" \
|| die "failed to filter high-probabilities matches"
mv -- "$output.t" "$output" \
|| die "failed to rename '$output.t' to '$output'"