-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
32 lines (26 loc) · 1.18 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
from Bio import Seq, SeqIO
def write_fasta(df, file_path, abbr_columns=None):
    """
    Save a dataframe to a two-line-per-record FASTA file.

    The dataframe must contain at least the columns "Id" and "Sequence".

    df: dataframe whose rows are written as FASTA records
    file_path: path (string) of the FASTA file to write; it is overwritten
    abbr_columns: optional column name, or list of column names, whose values
        are added to each record's description. Multiple values are joined
        with '|'. When None, the description is left empty.

    Prints the number of sequences written. Returns None.
    """
    # Import from the public location rather than relying on SeqIO
    # re-exporting the class as an implementation detail.
    from Bio.SeqRecord import SeqRecord

    # Normalise to a list of column names. A bare string must be treated as
    # ONE column; indexing the row with it yields a scalar string, and joining
    # that directly would interleave '|' between its characters.
    if abbr_columns is None:
        columns = []
    elif isinstance(abbr_columns, str):
        columns = [abbr_columns]
    else:
        columns = list(abbr_columns)

    records = [
        SeqRecord(
            seq=Seq.Seq(row["Sequence"]),
            id=row["Id"],
            # str() so that non-string cells (numbers, etc.) can be joined.
            description="|".join(str(row[col]) for col in columns),
        )
        for _, row in df.iterrows()
    ]

    # Plain write mode is enough; the handle is never read back.
    with open(file_path, "w") as fhandle:
        SeqIO.write(records, fhandle, "fasta-2line")
    print("Saved {:d} sequences.".format(len(records)))
def read_fasta(fname):
    """
    Read a FASTA file into a dataframe.

    Input: path name of the FASTA file
    Output: dataframe with one row per record and two columns,
        "Id" (the record id) and "Sequence" (the sequence as a plain string)
    """
    # Plain "r" mode: the legacy "U" flag was removed in Python 3.11 and
    # raises ValueError there; universal newlines are the default in text mode.
    with open(fname, "r") as f:
        # str(record.seq) is the supported way to get the sequence text; the
        # private `_data` attribute holds bytes in recent Biopython releases.
        rows = [(record.id, str(record.seq)) for record in SeqIO.parse(f, "fasta")]
    return pd.DataFrame(data=rows, columns=["Id", "Sequence"])