Skip to content

Commit

Permalink
Write script to export submissions to gff3
Browse files Browse the repository at this point in the history
Signed-off-by: Harsh Gupta <[email protected]>
  • Loading branch information
hargup committed Jan 2, 2015
1 parent 90d740b commit 709ac04
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 0 deletions.
9 changes: 9 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,15 @@ task 'import', [:annotations_file] do |t, args|
Importer.new(annotations_file).run
end

desc 'Export'
task 'export', [:annotations_file] do |t, args|
require_relative 'app'
App.init_config
App.load_services
annotations_file = args[:annotations_file]
Exporter.new(annotations_file).run
end

desc 'IRb.'
task 'irb' do
require_relative 'app'
Expand Down
154 changes: 154 additions & 0 deletions services/exporter.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# This file creates a GFF3 file for all the annotation submissions
# corresponding to GFF file specified by the user. The GFF3 specification can
# be found on The Sequence Ontology Project's website [1]. To verify the
# generated GFF3 file use the online tool provided by genometools [2].
class Exporter

BLACKLISTED_TYPES = %w(non_canonical_splice_site
non_canonical_translation_start_site
non_canonical_translation_stop_site)
def initialize(annotations_file)
@annotations_file = annotations_file
species, asm_id = annotations_file.split('/')[-2..-1]
asm_id.sub!(/(\.gff)$/, '')
@genome = Genome.first species: species, asm_id: asm_id

@subfeature_count = Hash.new(0)
end

attr_reader :genome

# Returns all the submissions objects for a given GFF3 file.
def submissions
ref_seqs = @genome.ref_seqs.map(&:seq_id)
task_ids = Task.where(ref_seq_id: ref_seqs).map(&:id)
Submission.where task_id: task_ids
end

def run
gff_data = []
submissions.each do |submission|
@subfeature_count['gene'] += 1
# HACK:
gene_line = []
gene_line << feature_to_gff(submission.data['value'].first, 0, false, true)
gff_data << gene_line

submission.data['value'].each do |transcript|
gff_data << transcript_to_gff(transcript)
end
end
puts create_gff_file_from_data(gff_data)
end

def transcript_to_gff(transcript)
data = []
data << feature_to_gff(transcript)

phase = 0
transcript['subfeatures'].each do |subfeature|
unless BLACKLISTED_TYPES.include? subfeature['type']
data << feature_to_gff(subfeature, phase, true)
end

if subfeature['type'] == 'CDS'
start = subfeature['start']
stop = subfeature['end']
phase = (3 - (((stop + 1) - start - phase) % 3)) % 3
end
end

data
end

def feature_to_gff(subfeature, phase = nil, parent = false, gene = false)
gff_strand = {
1 => '+',
-1 => '-',
nil => '.'
}

if gene
type = 'gene'
else
type = subfeature['type'] == 'transcript' ? 'mRNA' : subfeature['type']
@subfeature_count[type] += 1
end

data = []

data << "#{subfeature['seq_id']}"
data << 'afra' # source
data << "#{type}"
data << "#{subfeature['start']}"
data << "#{subfeature['end']}"
data << '.' # score
data << "#{gff_strand[subfeature['strand']]}"

# phase
if type == 'CDS'
data << "#{phase}"
else
data << '.'
end

# attributes
attribute_hash = {
id: "ID=#{type}#{@subfeature_count[type]};",
name: "Name=#{subfeature['name']};"
}
unless gene
if parent
attribute_hash[:parent] = "Parent=mRNA#{@subfeature_count['mRNA']}"
else
attribute_hash[:parent] = "Parent=gene#{@subfeature_count['gene']}"
end
end

attribute = attribute_hash.values.join('')
data << attribute

data
end

def create_gff_file_from_data(gff_data)
sr_data = [] # sr: sequence-region
seq_id_index = {}

# Seperate the transcript data on the basis of their seq_id
gff_data.each do |transcript_data|
seq_id = transcript_data[0][0]
unless seq_id_index.keys.include? seq_id
seq_id_index[seq_id] = sr_data.length
sr_data << []
end

sr_data[seq_id_index[seq_id]] << transcript_data
end

sr_str = []
sr_data.each_with_index do |file, i|
sr_str << []
seq_id = seq_id_index.invert[i]
start = 1
stop = RefSeq.where(seq_id: seq_id).first[:length]
sr_str[i] << "##sequence-region #{seq_id} #{start} #{stop}"

file.each do |transcript_data|
sr_str[i] << transcript_data.map {|line| line.join("\t")}.join("\n")
end

# REVIEW: this is crazy if the this line is seperated by tabs instead of
# space then the validator marks it as wrong
sr_str[i] = sr_str[i].join("\n")
end

"##gff-version 3\n" + sr_str.join("\n")
end
end

# References
# ==========
#
# [1]: http://www.sequenceontology.org/gff3.shtml
# [2]: http://genometools.org/cgi-bin/gff3validator.cgi

0 comments on commit 709ac04

Please sign in to comment.