Skip to content

Commit

Permalink
voicing
Browse files Browse the repository at this point in the history
  • Loading branch information
Rubiksman78 committed Jan 2, 2023
1 parent 51aee40 commit 90038b2
Show file tree
Hide file tree
Showing 8 changed files with 198 additions and 64 deletions.
50 changes: 50 additions & 0 deletions AI_submod/main_voicing.rpy
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
init 5 python in mas_voice:
import subprocess
from threading import Thread
import store
import os
import sys
import re
import time
import Queue
import socket
from socket import AF_INET, SOCK_STREAM
from threading import Thread,current_thread
import select
from time import sleep
import subprocess

renpy.music.register_channel("mvoice", mixer= "sfx", loop=False)

def receiveMessage():
    """Block until a chunk (up to BUFSIZ bytes) arrives from the voicing
    server socket and return it decoded as UTF-8 text."""
    return client_socket.recv(BUFSIZ).decode("utf8")

def send_simple(prefix):
    """Send the text *prefix* to the voicing server as UTF-8 bytes.

    Fix: the original used ``bytes(prefix).encode("utf8")`` — on Python 3
    ``bytes(str)`` raises TypeError (an encoding argument is required), and
    on Python 2 it breaks for non-ASCII text (implicit ASCII round-trip).
    Encoding the string directly is correct on both.
    """
    client_socket.send(prefix.encode("utf8"))

def audio_file_exists(filename):
    """Return True when *filename* names an existing regular file."""
    is_regular_file = os.path.isfile(filename)
    return is_regular_file

# Address of the local voicing (TTS) server started by voicing.py.
HOST = '127.0.0.1'
PORT = 12344

BUFSIZ = 1024
ADDR = (HOST, PORT)
client_socket = socket.socket(AF_INET, SOCK_STREAM)

# Best-effort connect: if the TTS server is not running the submod simply
# stays inactive (connected == False) instead of crashing the game.
try:
    client_socket.connect(ADDR)
    connected = True
except socket.error:
    # Narrowed from a bare except: only network failures should disable
    # voicing; a bare except would also have swallowed KeyboardInterrupt
    # and programming errors.
    connected = False

def get_wrapped_say(func):
    """Wrap Ren'Py's say function so every displayed line is also forwarded
    to the voicing server before being shown.

    *func* is the original ``renpy.say``; the returned callable keeps its
    signature so it can replace it transparently.
    """
    def new_say(who, what, interact=True, *args, **kwargs):
        # Substitute [variables] first so the server voices the final text.
        send_simple(renpy.substitute(what))
        func(who, what, interact=interact, *args, **kwargs)
    return new_say

# Monkey-patch Ren'Py's say function only when the connection to the
# voicing server succeeded, so dialogue is forwarded for TTS playback.
if connected:
    renpy.say = get_wrapped_say(renpy.say)
6 changes: 2 additions & 4 deletions combined_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from PIL import Image
from torchvision import transforms

#from speech_to_text import stt
import speech_recognition as sr
import whisper

Expand All @@ -41,7 +40,7 @@
help='use chatbot')
parser.add_argument('--use_emotion_detection', type=bool, default=True,
help='use emotion detection')
parser.add_argument('--use_audio', type=bool, default=True,
parser.add_argument('--use_audio', type=bool, default=False,
help='use audio')
parser.add_argument('--emotion_time', type=int, default=10,
help='time between camera captures')
Expand Down Expand Up @@ -177,7 +176,7 @@ def call(client):
loop.close()

#Launch the game
subprocess.Popen(GAME_PATH+'\DDLC.exe')
#subprocess.Popen(GAME_PATH+'\DDLC.exe')

async def listenToClient(client):
""" Get client username """
Expand All @@ -200,7 +199,6 @@ async def listenToClient(client):

#Speech to text
if received_msg == "begin_record":
#received_msg = stt()

with sr.Microphone(sample_rate=16000) as source:
sendMessage("yes".encode("utf-8"))
Expand Down
9 changes: 1 addition & 8 deletions combined_server_for_the_bold.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from PIL import Image
from torchvision import transforms

from speech_to_text import stt
import speech_recognition as sr
import whisper

Expand Down Expand Up @@ -218,7 +217,6 @@ async def listenToClient(client):

#Speech to text
if received_msg == "begin_record":
#received_msg = stt()

with sr.Microphone(sample_rate=16000) as source:
sendMessage("yes".encode("utf-8"))
Expand Down Expand Up @@ -297,12 +295,7 @@ async def listenToClient(client):
msg_audio = msg_audio.replace("{i}","")
msg_audio = msg_audio.replace("{/i}",".")
msg_audio = msg_audio.replace("~","!")
# subprocess.check_call(['tts', '--text', msg_audio, '--model_name', 'tts_models/multilingual/multi-dataset/your_tts', '--speaker_wav', 'audios/talk_13.wav', '--language_idx', 'en', '--out_path', GAME_PATH + '/game/Submods/AI_submod/audio/out.wav'])
# f = open(GAME_PATH+'/game/Submods/AI_submod/audio/out.wav', 'rb')
# AudioSegment.from_wav(f).export(GAME_PATH+'/game/Submods/AI_submod/audio/out.ogg', format='ogg')
# f.close()
# os.remove(GAME_PATH+'/game/Submods/AI_submod/audio/out.wav')


spec, audio = infer(spec_model, vocoder,msg_audio)
audio = ipd.Audio(audio, rate=22050)
audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)
Expand Down
27 changes: 27 additions & 0 deletions download_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import torch
import torch

import speech_recognition as sr
import whisper

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

###Load the emotion model#####
emotion_model = torch.load('models/enet_b2_7.pt').to(device)

###Load the speech recognizer#####
english = True
def init_stt(model="base", english=True, energy=300, pause=0.8, dynamic_energy=False):
    """Load a Whisper speech-to-text model and a configured recognizer.

    Returns a ``(recognizer, whisper_model)`` pair.  *energy*, *pause* and
    *dynamic_energy* tune the SpeechRecognition listener thresholds.
    """
    # Whisper ships English-only variants for every size except "large".
    model_name = model + ".en" if (model != "large" and english) else model
    audio_model = whisper.load_model(model_name)

    # Configure the recognizer's energy/pause behaviour up front.
    recognizer = sr.Recognizer()
    recognizer.energy_threshold = energy
    recognizer.pause_threshold = pause
    recognizer.dynamic_energy_threshold = dynamic_energy

    return recognizer, audio_model

r,audio_model = init_stt()
10 changes: 1 addition & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,4 @@ pyaudio
SpeechRecognition
git+https://github.com/openai/whisper.git
#New TTS
# git+https://github.com/NVIDIA/NeMo.git
# omegaconf
# hydra-core
# pytorch-lightning
# sentencepiece
# youtokentome
# wget
#pynini==2.1.4 #on windows, download https://www.openfst.org/twiki/pub/GRM/PyniniDownload/pynini-2.1.5.tar.gz and install it manually (need Microsoft Visual C++ 14.0 https://aka.ms/vs/17/release/vs_BuildTools.exe)
# nemo_toolkit['all']
#simpleaudio
5 changes: 5 additions & 0 deletions setup_new_tts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Install system and Python dependencies for the new NeMo-based TTS.
# NOTE(review): assumes a Debian/Ubuntu host (apt-get) run with root
# privileges, and that $BRANCH is exported before running — otherwise the
# NeMo pip spec degenerates to "...@#egg=..."; confirm both before use.
apt-get install sox libsndfile1 ffmpeg
pip install wget text-unidecode pynini==2.1.4
python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
# Fetch and run NVIDIA's helper to install pynini where the wheel is unavailable.
wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/install_pynini.sh
bash install_pynini.sh
43 changes: 0 additions & 43 deletions test_client.py

This file was deleted.

112 changes: 112 additions & 0 deletions voicing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from socket import AF_INET, SOCK_STREAM
import socket
from threading import Thread
import torch
import nest_asyncio
import IPython.display as ipd
import simpleaudio as sa
import re
import asyncio

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

nest_asyncio.apply()

from new_tts_infer import infer,get_best_ckpt_from_last_run
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import FastPitchModel

GAME_PATH = "/mnt/c/SAMUEL/ddlc-win/DDLC-1.1.1-pc"
#vocoder = HifiGanModel.from_pretrained("tts_hifigan")
vocoder = HifiGanModel.load_from_checkpoint("hifigan_ft/HifiGan/2023-01-02_14-20-41/checkpoints/HifiGan--val_loss=0.4200-epoch=167-last.ckpt")
vocoder = vocoder.eval().to(device)

new_speaker_id = 9017
duration_mins = 5
mixing = False
original_speaker_id = "ljspeech"

last_ckpt = get_best_ckpt_from_last_run("./", new_speaker_id, duration_mins, mixing, original_speaker_id)
print(last_ckpt)

spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
spec_model.eval().to(device)

# Global variables
clients = {}
addresses = {}

# Listen on localhost only; the game client (main_voicing.rpy) connects here.
HOST = '127.0.0.1'
#HOST = socket.gethostbyname(socket.gethostname())
print(HOST)
PORT = 12344
BUFSIZE = 1024
ADDRESS = (HOST, PORT)
try:
    SERVER = socket.socket(AF_INET, SOCK_STREAM)
    print("Socket creation successful")
except socket.error:
    # Fix: the original bare except printed a message and fell through to
    # SERVER.bind, which then crashed with a confusing NameError.  Report
    # and propagate the real failure instead.
    print("Socket creation Failed")
    raise
SERVER.bind(ADDRESS)

def listen():
    """Accept incoming connections forever, one handler thread per client."""
    print("Waiting for connection...")
    while True:
        conn, conn_address = SERVER.accept()
        print("%s:%s has connected." % conn_address)
        addresses[conn] = conn_address
        Thread(target=call, args=(conn,)).start()

def call(client):
    """Run the per-client listener coroutine on its own event loop.

    Each client thread needs a dedicated loop because asyncio loops are
    not shared across threads.
    """
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(listenToClient(client))
    event_loop.close()

# Clean the log.txt file: truncate it at module load so each run starts
# with an empty session log.
with open("log.txt", "w") as f:
    f.write("")

async def listenToClient(client):
    """Receive dialogue lines from the game client and voice them locally.

    Runs forever on the per-client event loop created by call().  Each
    non-empty message is stripped of Ren'Py markup, synthesized with the
    FastPitch + HifiGan models, and played; playback of the previous line
    is stopped when a new line arrives.
    """
    name = "User"
    clients[client] = name
    step = 0
    sentence_list = []
    play_obj = None  # defensive: defined before first use even if step logic changes
    while True:
        msg = client.recv(BUFSIZE).decode("utf-8")

        if msg != "":
            if step > 0:
                play_obj.stop()
                # Fix: skip duplicate lines by comparing the raw incoming
                # message with the last raw message.  The original compared
                # the *processed* previous text (msg_audio) against the raw
                # previous message, so the dedup check could never fire.
                if msg == sentence_list[-1]:
                    continue
            # Strip Ren'Py text tags and normalize punctuation for the TTS.
            msg_audio = msg.replace("\n", " ")
            msg_audio = msg_audio.replace("{i}", "")
            msg_audio = msg_audio.replace("{/i}", ".")
            msg_audio = msg_audio.replace("~", "!")
            # Remove any remaining {...} tags.
            # NOTE(review): the original comment also mentioned [] but the
            # regex never handled it — [var] is Ren'Py interpolation, so it
            # is deliberately left untouched here; confirm with the sender.
            msg_audio = re.sub(r'\{.*?\}', '', msg_audio)

            print(repr(msg_audio))
            sentence_list.append(msg)

            spec, audio = infer(spec_model, vocoder, msg_audio)
            audio = ipd.Audio(audio, rate=22050)
            play_obj = sa.play_buffer(audio.data, 1, 2, 22050)
            step += 1

def sendMessage(msg, name=""):
    """Broadcast *msg* (bytes), prefixed with *name*, to every connected
    client in the chat room."""
    # Encode the name prefix once instead of per recipient.
    prefix = bytes(name, "utf8")
    for sock in clients:
        sock.send(prefix + msg)

if __name__ == "__main__":
    # Allow up to 10 queued connections, then accept clients on a
    # dedicated thread until it exits, and release the socket afterwards.
    SERVER.listen(10)
    ACCEPT_THREAD = Thread(target=listen)
    ACCEPT_THREAD.start()
    ACCEPT_THREAD.join()
    SERVER.close()

0 comments on commit 90038b2

Please sign in to comment.