-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcgi_hmm.py
62 lines (53 loc) · 2.47 KB
/
cgi_hmm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import matplotlib.pyplot as plt
import numpy as np
from hmmlearn import hmm
np.random.seed(42)
def cgi_plot(states, cg_positions):
"""
Produce a basic plot showing where CpG dinucleotides occur
in a sequence, and what state the model is in.
"""
fig, ax = plt.subplots(figsize=(100, 10))
ax.plot(states)
ax.set_title('States over time')
ax.set_xlabel('Genome')
ax.set_ylabel('State')
cgs = np.argwhere(cg_positions.flatten()>0)
ax.plot(cgs.flatten(), [-0.2]*len(cgs),'|', color='k')
plt.yticks([-.2, 0, 1], ["CpGs", "bg", "Island"])
return plt
def seq_to_cg(sequence):
"""
Convert a DNA sequence into CpG-dinucleotide encoding, where 1 represents
a CpG and 0 represents anything else.
"""
cgmarks = np.zeros(len(sequence), dtype=np.uint32).reshape(len(sequence), 1)
for i in range(0, len(sequence)):
dinucleotide = sequence[i:(i+2)]
if dinucleotide == "GC":
cgmarks[i] = 1
else:
cgmarks[i] = 0
return cgmarks
# Parameterize a basic CpG island finder.
# It has been initialized with random parameters
gen_model = hmm.CategoricalHMM(n_components=2, random_state=99)
gen_model.startprob_ = np.array([0.5, 0.5])
gen_model.transmat_ = np.array([[0.5, 0.5],
[0.5, 0.5]])
gen_model.emissionprob_ = \
np.array([[50 / 100, 50 / 100],
[50 / 100, 50 / 100]])
# A dummy demo sequence for you to test on
S = "GCTGGCATCGCGCGTACTACTATCAGCGCGCGCGCGCGCGCGACGGCGAGCGCGACGCGGCCGGCAGCAGCGACGACGGACGCTATACTATCATGCTACGACGATCGTATCTATGCTGGCATCGCGCGTACTACTATCAGCGCGCGCGCGCGCGCGACGGCGAGCGCGACGCGGCCGGCAGCAGCGACGACGGACGCTATACTATCATGCTACGACGATCGTATCTATGCTGGCATCGCGCGTACTACTATCAGCGCGCGCGCGCGCGCGACGGCGAGCGCGACGCGGCCGGCAGCAGCGACGACGGACGCTATACTATCATGCTACGACGATCGTATCTATGCTGGCATCGCGCGTACTACTATCAGCGCGCGCGCGCGCGCGACGGCGAGCGCGACGCGGCCGGCAGCAGCGACGACGGACGCTATACTATCATGCTACGACGATCGTATCTATGCTGGCATCGCGCGTACTACTATCAGCGCGCGCGCGCGCGCGACGGCGAGCGCGACGCGGCCGGCAGCAGCGACGACGGACGCTATACTATCATGCTACGACGATCGTATCTATGCTGGCATCGCGCGTACTACTATCAGCGCGCGCGCGCGCGCGACGGCGAGCGCGACGCGGCCGGCAGCAGCGACGACGGACGCTATACTATCATGCTACGACGATCGTATCTAT"
# Find the viterbi parse of this sequence and plot it
# At first this will not yield a good segmentation; you will need to tune the
# above parameters by hand to get a parse you're happy with.
cgmarks = seq_to_cg(S)
viterbi_path = gen_model.predict(cgmarks)
plt = cgi_plot(viterbi_path, cgmarks)
plt.savefig("cgi.png")
# Read in a sequence from a fasta file like this:
with open("chunk.fa", "r") as f:
f.readline() # discard
S = f.read().replace("\n", "")