From 90038b23a6d6c9fd19ed6fd2d10d90a18f326ca7 Mon Sep 17 00:00:00 2001
From: Rubiksman78
Date: Mon, 2 Jan 2023 15:45:06 +0100
Subject: [PATCH] voicing

---
 AI_submod/main_voicing.rpy      |  50 ++++++++++++++
 combined_server.py              |   6 +-
 combined_server_for_the_bold.py |   9 +--
 download_models.py              |  27 ++++++++
 requirements.txt                |  10 +--
 setup_new_tts.sh                |   5 ++
 test_client.py                  |  43 ------------
 voicing.py                      | 112 ++++++++++++++++++++++++++++++++
 8 files changed, 198 insertions(+), 64 deletions(-)
 create mode 100644 AI_submod/main_voicing.rpy
 create mode 100644 download_models.py
 create mode 100644 setup_new_tts.sh
 delete mode 100644 test_client.py
 create mode 100644 voicing.py

diff --git a/AI_submod/main_voicing.rpy b/AI_submod/main_voicing.rpy
new file mode 100644
index 0000000..bbd52d8
--- /dev/null
+++ b/AI_submod/main_voicing.rpy
@@ -0,0 +1,50 @@
+init 5 python in mas_voice:
+    import store
+    import os
+    import sys
+    import re
+    import time
+    import Queue
+    import select
+    import socket
+    import subprocess
+    from socket import AF_INET, SOCK_STREAM
+    from threading import Thread, current_thread
+    from time import sleep
+
+    renpy.music.register_channel("mvoice", mixer="sfx", loop=False)
+
+    def receiveMessage():
+        msg = client_socket.recv(BUFSIZ).decode("utf8")
+        return msg
+
+    def send_simple(prefix):
+        client_socket.send(prefix.encode("utf8"))
+
+    def audio_file_exists(filename):
+        return os.path.isfile(filename)
+
+    HOST = '127.0.0.1'
+    PORT = 12344
+
+    BUFSIZ = 1024
+    ADDR = (HOST, PORT)
+    client_socket = socket.socket(AF_INET, SOCK_STREAM)
+
+    # If the voicing server is not running, voicing stays disabled for this session
+    try:
+        client_socket.connect(ADDR)
+        connected = True
+    except:
+        connected = False
+
+    # Wrap renpy.say so every displayed line is also sent to the TTS server
+    def get_wrapped_say(func):
+        def new_say(who, what, interact=True, *args, **kwargs):
+            speaktext = renpy.substitute(what)
+            send_simple(speaktext)
+            func(who, what, interact=interact, *args, **kwargs)
+        return new_say
+
+    if connected:
+        renpy.say = get_wrapped_say(renpy.say)
diff --git a/combined_server.py b/combined_server.py
index f45de88..f04a028 100644
--- a/combined_server.py
+++ b/combined_server.py
@@ -19,7 +19,6 @@
 from PIL import Image
 from torchvision import transforms
 
-#from speech_to_text import stt
 import speech_recognition as sr
 import whisper
 
@@ -41,7 +40,7 @@
                     help='use chatbot')
 parser.add_argument('--use_emotion_detection', type=bool, default=True,
                     help='use emotion detection')
-parser.add_argument('--use_audio', type=bool, default=True,
+parser.add_argument('--use_audio', type=bool, default=False,
                     help='use audio')
 parser.add_argument('--emotion_time', type=int, default=10,
                     help='time between camera captures')
@@ -177,7 +176,7 @@ def call(client):
     loop.close()
 
 #Launch the game
-subprocess.Popen(GAME_PATH+'\DDLC.exe')
+#subprocess.Popen(GAME_PATH+'\DDLC.exe')
 
 async def listenToClient(client):
     """ Get client username """
@@ -200,7 +199,6 @@ async def listenToClient(client):
 
         #Speech to text
         if received_msg == "begin_record":
-            #received_msg = stt()
             with sr.Microphone(sample_rate=16000) as source:
                 sendMessage("yes".encode("utf-8"))
diff --git a/combined_server_for_the_bold.py b/combined_server_for_the_bold.py
index 4bfa476..06b4cb3 100644
--- a/combined_server_for_the_bold.py
+++ b/combined_server_for_the_bold.py
@@ -19,7 +19,6 @@
 from PIL import Image
 from torchvision import transforms
 
-from speech_to_text import stt
 import speech_recognition as sr
 import whisper
 
@@ -218,7 +217,6 @@ async def listenToClient(client):
 
         #Speech to text
         if received_msg == "begin_record":
-            #received_msg = stt()
             with sr.Microphone(sample_rate=16000) as source:
                 sendMessage("yes".encode("utf-8"))
@@ -297,12 +295,7 @@ async def listenToClient(client):
             msg_audio = msg_audio.replace("{i}","")
             msg_audio = msg_audio.replace("{/i}",".")
             msg_audio = msg_audio.replace("~","!")
-            # subprocess.check_call(['tts', '--text', msg_audio, '--model_name', 'tts_models/multilingual/multi-dataset/your_tts', '--speaker_wav', 'audios/talk_13.wav', '--language_idx', 'en', '--out_path', GAME_PATH + '/game/Submods/AI_submod/audio/out.wav'])
-            # f = open(GAME_PATH+'/game/Submods/AI_submod/audio/out.wav', 'rb')
-            # AudioSegment.from_wav(f).export(GAME_PATH+'/game/Submods/AI_submod/audio/out.ogg', format='ogg')
-            # f.close()
-            # os.remove(GAME_PATH+'/game/Submods/AI_submod/audio/out.wav')
-
+
             spec, audio = infer(spec_model, vocoder,msg_audio)
             audio = ipd.Audio(audio, rate=22050)
             audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)
diff --git a/download_models.py b/download_models.py
new file mode 100644
index 0000000..961b80a
--- /dev/null
+++ b/download_models.py
@@ -0,0 +1,27 @@
+import torch
+
+import speech_recognition as sr
+import whisper
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+### Load the emotion model ###
+emotion_model = torch.load('models/enet_b2_7.pt').to(device)
+
+### Load the speech recognizer ###
+def init_stt(model="base", english=True, energy=300, pause=0.8, dynamic_energy=False):
+    # Whisper ships dedicated English models for every size except "large"
+    if model != "large" and english:
+        model = model + ".en"
+    audio_model = whisper.load_model(model)
+
+    # Load the speech recognizer and set the initial energy and pause thresholds
+    r = sr.Recognizer()
+    r.energy_threshold = energy
+    r.pause_threshold = pause
+    r.dynamic_energy_threshold = dynamic_energy
+
+    return r, audio_model
+
+r, audio_model = init_stt()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 0266223..c8ee9d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,12 +19,4 @@
 pyaudio
 SpeechRecognition
 git+https://github.com/openai/whisper.git
 #New TTS
-# git+https://github.com/NVIDIA/NeMo.git
-# omegaconf
-# hydra-core
-# pytorch-lightning
-# sentencepiece
-# youtokentome
-# wget
-#pynini==2.1.4 #on windows, download https://www.openfst.org/twiki/pub/GRM/PyniniDownload/pynini-2.1.5.tar.gz and install it manually (need Microsoft Visual C++ 14.0 https://aka.ms/vs/17/release/vs_BuildTools.exe)
-# nemo_toolkit['all']
+#simpleaudio
diff --git a/setup_new_tts.sh b/setup_new_tts.sh
new file mode 100644
index 0000000..0fb2e82
--- /dev/null
+++ b/setup_new_tts.sh
@@ -0,0 +1,5 @@
+# $BRANCH must point at the desired NeMo git ref (e.g. main) before running
+apt-get install sox libsndfile1 ffmpeg
+pip install wget text-unidecode pynini==2.1.4
+python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
+wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/install_pynini.sh
+bash install_pynini.sh
\ No newline at end of file
diff --git a/test_client.py b/test_client.py
deleted file mode 100644
index 793afd7..0000000
--- a/test_client.py
+++ /dev/null
@@ -1,43 +0,0 @@
-emotions = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
-from socket import AF_INET, socket, SOCK_STREAM
-from threading import Thread
-
-def receiveMessage():
-    msg = client_socket.recv(BUFSIZ).decode("utf8")
-    return msg
-
-def sendMessage(prefix,step):
-    my_msg = input(prefix)
-    client_socket.send(bytes(my_msg + "/g" + step).encode("utf8"))
-    return my_msg
-
-HOST = "127.0.0.1"
-PORT = 12346
-
-BUFSIZ = 1024
-ADDR = (HOST, PORT)
-client_socket = socket(AF_INET, SOCK_STREAM)
-client_socket.connect(ADDR)
-
-while True:
-    received_emotion = receiveMessage()
-
-    if received_emotion == "angry":
-        print("You are angry")
-    elif received_emotion == "disgusted":
-        print("You are disgusted")
-    elif received_emotion == "fearful":
-        print("You are fearful")
-    elif received_emotion == "happy":
-        print("You are happy")
-    elif received_emotion == "neutral":
-        print("You are neutral")
-    elif received_emotion == "sad":
-        print("You are sad")
-    elif received_emotion == "surprised":
-        print("You are surprised")
-    elif received_emotion == "confused":
-        print("You are confused")
-    else:
-        print("I can't detect your emotion")
-
\ No newline at end of file
diff --git a/voicing.py b/voicing.py
new file mode 100644
index 0000000..ed8d8a0
--- /dev/null
+++ b/voicing.py
@@ -0,0 +1,112 @@
+from socket import AF_INET, SOCK_STREAM
+import socket
+from threading import Thread
+import torch
+import nest_asyncio
+import IPython.display as ipd
+import simpleaudio as sa
+import re
+import asyncio
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Allow nested event loops (one per client thread)
+nest_asyncio.apply()
+
+from new_tts_infer import infer, get_best_ckpt_from_last_run
+from nemo.collections.tts.models import HifiGanModel
+from nemo.collections.tts.models import FastPitchModel
+
+GAME_PATH = "/mnt/c/SAMUEL/ddlc-win/DDLC-1.1.1-pc"
+
+# Fine-tuned HifiGan vocoder
+#vocoder = HifiGanModel.from_pretrained("tts_hifigan")
+vocoder = HifiGanModel.load_from_checkpoint("hifigan_ft/HifiGan/2023-01-02_14-20-41/checkpoints/HifiGan--val_loss=0.4200-epoch=167-last.ckpt")
+vocoder = vocoder.eval().to(device)
+
+new_speaker_id = 9017
+duration_mins = 5
+mixing = False
+original_speaker_id = "ljspeech"
+
+# Fine-tuned FastPitch spectrogram model from the last training run
+last_ckpt = get_best_ckpt_from_last_run("./", new_speaker_id, duration_mins, mixing, original_speaker_id)
+print(last_ckpt)
+
+spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
+spec_model.eval().to(device)
+
+# Global variables
+clients = {}
+addresses = {}
+
+HOST = '127.0.0.1'
+#HOST = socket.gethostbyname(socket.gethostname())
+print(HOST)
+PORT = 12344
+BUFSIZE = 1024
+ADDRESS = (HOST, PORT)
+try:
+    SERVER = socket.socket(AF_INET, SOCK_STREAM)
+    print("Socket creation successful")
+except socket.error:
+    print("Socket creation failed")
+SERVER.bind(ADDRESS)
+
+def listen():
+    """ Wait for incoming connections """
+    print("Waiting for connection...")
+    while True:
+        client, client_address = SERVER.accept()
+        print("%s:%s has connected." % client_address)
+        addresses[client] = client_address
+        Thread(target=call, args=(client,)).start()
+
+def call(client):
+    # Give each client thread its own event loop
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    loop.run_until_complete(listenToClient(client))
+    loop.close()
+
+#Clean the log.txt file
+with open("log.txt", "w") as f:
+    f.write("")
+
+async def listenToClient(client):
+    """ Receive dialogue lines from the game and voice them """
+    name = "User"
+    clients[client] = name
+    step = 0
+    sentence_list = []
+    while True:
+        msg = client.recv(BUFSIZE).decode("utf-8")
+
+        # An empty read means the client disconnected
+        if msg == "":
+            break
+
+        if step > 0:
+            # Interrupt the line currently playing and skip duplicates
+            play_obj.stop()
+            if msg == sentence_list[-1]:
+                continue
+
+        # Strip Ren'Py markup before synthesis
+        msg_audio = msg.replace("\n", " ")
+        msg_audio = msg_audio.replace("{i}", "")
+        msg_audio = msg_audio.replace("{/i}", ".")
+        msg_audio = msg_audio.replace("~", "!")
+        # Remove characters in {} and []
+        msg_audio = re.sub(r'\{.*?\}', '', msg_audio)
+        msg_audio = re.sub(r'\[.*?\]', '', msg_audio)
+
+        print(repr(msg_audio))
+        sentence_list.append(msg)
+
+        spec, audio = infer(spec_model, vocoder, msg_audio)
+        audio = ipd.Audio(audio, rate=22050)
+        play_obj = sa.play_buffer(audio.data, 1, 2, 22050)
+        step += 1
+
+def sendMessage(msg, name=""):
+    """ Send a message to every connected client """
+    for client in clients:
+        client.send(bytes(name, "utf8") + msg)
+
+if __name__ == "__main__":
+    SERVER.listen(10)
+    ACCEPT_THREAD = Thread(target=listen)
+    ACCEPT_THREAD.start()
+    ACCEPT_THREAD.join()
+    SERVER.close()
\ No newline at end of file
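
Note: for quick manual testing of voicing.py outside the game, a client only needs to open a TCP connection to 127.0.0.1:12344 and send raw UTF-8 text, exactly as AI_submod/main_voicing.rpy does. A minimal sketch, not part of the patch (the file name and sample sentence are made up):

    # test_voicing_client.py -- hypothetical helper mirroring main_voicing.rpy.
    # The protocol is unframed UTF-8 text over TCP; the server reads up to
    # BUFSIZE (1024) bytes per recv() and synthesizes whatever arrives.
    import socket

    ADDR = ("127.0.0.1", 12344)

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect(ADDR)
        s.send("Hello! This line should be synthesized and played.".encode("utf-8"))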