Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor to begin to productionize some of the pipelines #2

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 64 additions & 27 deletions example_data_access.ipynb

Large diffs are not rendered by default.

Empty file added force_align_pipeline.py
Empty file.
212 changes: 212 additions & 0 deletions models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import sys; import os
import requests
import pickle
import subprocess
import hashlib
from collections import OrderedDict

import torch
import torchaudio
from bs4 import BeautifulSoup


class AudioData:
def __init__(self, cache_dir=''):
"""
In:
cache_dir: str, path to local files

"""
self.cache_dir = cache_dir # str
self.waveform = None # torch.Tensor
self.sample_rate = None # int


def slice_waveform(self, start_sec, close_sec):
""""""
s = round(start_sec*self.sample_rate)
c = round(close_sec*self.sample_rate)
slc = AudioData(cache_dir='temp')
slc.waveform = self.waveform[:, s:c]
slc.sample_rate = self.sample_rate
return slc


def write_wav(self, fp):
"""
Writes torchaudio file to wav format.

In:
fp: str, path to write

Returns:
None
"""
scipy.io.wavfile.write(
filename=fp,
rate=self.sample_rate,
data=self.waveform.numpy().transpose()
)

def load_wav(self, fp):
"""
Loads wav data from file.

In:
fp: str, path to wav file

Returns:
None
"""
self.waveform, self.sample_rate = torchaudio.load(fp)


class TranscriptData:
def __init__(self):
""""""
self.transcript = []
self.files = []
self.failures = []
self.driver_fp = '/Users/free-soellingeraj/Downloads/chromedriver'

def get_audio_url(self, codec='mp3'):
""""""
if not self.transcript:
raise Exception('No transcript available.')

if not self.files:
raise Exception('No files available in transcript.')

for url in self.files:
if codec in url:
return url

raise Exception(
'No file found with codec: {}.'.format(codec)
)


def get(self, transcript_fp):
""""""
with open(transcript_fp, 'rb') as f:
return pickle.load(f)



def write(self, data):
"""
Save transcript data to fn

In:
fn: str, a local filepath to write to

Returns:
None
"""
with open(self.fp, 'wb') as fout:
pickle.dump(data, fout)


class AudioDataBunch:
def __init__(self):
""""""
self.transcript = None
self.audio_data = None
self.bunch_sections = []


class TranscriptSection:
def __init__(self):
"""
transcript_section.keys():
['raw', 'case_name', 'conv_type', 'conv_date',
'speaker', 'start_time', 'stop_time', 'transcript']

"""
self.raw = None
self.parent_transcript = None
self.conv_type = None
self.conv_date = None
self.speaker = None
self.start_time = None
self.stop_time = None
self.text = None


def write_text(fp):
"""
Writes transcript text to txt file.

In:
fp: str, path to write

Returns:
None
"""
with open(fp, 'w') as f:
f.write(self.text)


class ForceAlignedSection:

def __init__(self):
""""""
path_to_gentle = '/Users/free-soellingeraj/code/gentle/align.py'

def force_align(tempdir, outdir, section_id, transcript_text,
audio_section, sample_rate):
"""
Force aligns transcript_text with audio waveform (audio_section).

Requires: lowerquality/gentle following installation procedures.

In:
tempdir: str, path that runtime has access to for temp storage.
outdir: str, path that the force-aligned output will be written
section_id: str,
{case_name}_{conv_type}_{speaker}_{start_time}_{stop_time}
transcript_text: str, transcription to align
audio_section: torch.Tensor,
audio waveform that contains transcription_text
sample_rate:

Note: process writes 2 temp files. 1 wav and 1 txt.

TODO: add log lines
"""
wav_fp = os.path.join(tempdir, 'temp_'+section_id+'.wav')
txt_fp = os.path.join(tempdir, 'temp_'+section_id+'.txt')
out_fp = os.path.join(outdir, section_id+'.json')
if os.path.exists(out_fp): return

write_wav(
fp=wav_fp,
sample_rate=sample_rate,
audio_section=audio_section
)
write_transcript_section(
fp=txt_fp,
transcript_text=transcript_text
)
cmd = ['python3',
self.path_to_gentle,
wav_fp, txt_fp, '-o', out_fp]
var = subprocess.call(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
os.remove(wav_fp)
os.remove(txt_fp)
return var


class SectionBunch:
def __init__(self):
self.transcript_section = None
self.audio_data = None

def create(self, transcript_section, parent_audio_data):
self.transcript_section = transcript_section
self.audio_data = parent_audio_data.slice_waveform(
start_sec=float(self.transcript_section.section['start_time']),
close_sec=float(self.transcript_section.section['stop_time'])
)
Loading