Skip to content

Commit

Permalink
voicing
Browse files Browse the repository at this point in the history
  • Loading branch information
Rubiksman78 committed Jan 2, 2023
1 parent 51aee40 commit 90038b2
Show file tree
Hide file tree
Showing 8 changed files with 198 additions and 64 deletions.
50 changes: 50 additions & 0 deletions AI_submod/main_voicing.rpy
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
init 5 python in mas_voice:
import subprocess
from threading import Thread
import store
import os
import sys
import re
import time
import Queue
import socket
from socket import AF_INET, SOCK_STREAM
from threading import Thread,current_thread
import select
from time import sleep
import subprocess

renpy.music.register_channel("mvoice", mixer= "sfx", loop=False)

def receiveMessage():
    """Block until a chunk (up to BUFSIZ bytes) arrives from the voicing
    server socket and return it decoded as UTF-8 text."""
    return client_socket.recv(BUFSIZ).decode("utf8")

def send_simple(prefix):
    """Send the text *prefix* to the voicing server as UTF-8 bytes.

    Fix: the original used ``bytes(prefix).encode("utf8")`` — on Python 3
    ``bytes(str)`` raises TypeError (an encoding argument is required), and
    on Python 2 it breaks for non-ASCII text (implicit ASCII round-trip).
    Encoding the string directly is correct on both.
    """
    client_socket.send(prefix.encode("utf8"))

def audio_file_exists(filename):
    """Return True when *filename* names an existing regular file."""
    is_regular_file = os.path.isfile(filename)
    return is_regular_file

# Address of the local voicing (TTS) server started by voicing.py.
HOST = '127.0.0.1'
PORT = 12344

BUFSIZ = 1024
ADDR = (HOST, PORT)
client_socket = socket.socket(AF_INET, SOCK_STREAM)

# Best-effort connect: if the TTS server is not running the submod simply
# stays inactive (connected == False) instead of crashing the game.
try:
    client_socket.connect(ADDR)
    connected = True
except socket.error:
    # Narrowed from a bare except: only network failures should disable
    # voicing; a bare except would also have swallowed KeyboardInterrupt
    # and programming errors.
    connected = False

def get_wrapped_say(func):
    """Wrap Ren'Py's say function so every displayed line is also forwarded
    to the voicing server before being shown.

    *func* is the original ``renpy.say``; the returned callable keeps its
    signature so it can replace it transparently.
    """
    def new_say(who, what, interact=True, *args, **kwargs):
        # Substitute [variables] first so the server voices the final text.
        send_simple(renpy.substitute(what))
        func(who, what, interact=interact, *args, **kwargs)
    return new_say

# Monkey-patch Ren'Py's say function only when the connection to the
# voicing server succeeded, so dialogue is forwarded for TTS playback.
if connected:
    renpy.say = get_wrapped_say(renpy.say)
6 changes: 2 additions & 4 deletions combined_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from PIL import Image
from torchvision import transforms

#from speech_to_text import stt
import speech_recognition as sr
import whisper

Expand All @@ -41,7 +40,7 @@
help='use chatbot')
parser.add_argument('--use_emotion_detection', type=bool, default=True,
help='use emotion detection')
parser.add_argument('--use_audio', type=bool, default=True,
parser.add_argument('--use_audio', type=bool, default=False,
help='use audio')
parser.add_argument('--emotion_time', type=int, default=10,
help='time between camera captures')
Expand Down Expand Up @@ -177,7 +176,7 @@ def call(client):
loop.close()

#Launch the game
subprocess.Popen(GAME_PATH+'\DDLC.exe')
#subprocess.Popen(GAME_PATH+'\DDLC.exe')

async def listenToClient(client):
""" Get client username """
Expand All @@ -200,7 +199,6 @@ async def listenToClient(client):

#Speech to text
if received_msg == "begin_record":
#received_msg = stt()

with sr.Microphone(sample_rate=16000) as source:
sendMessage("yes".encode("utf-8"))
Expand Down
9 changes: 1 addition & 8 deletions combined_server_for_the_bold.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from PIL import Image
from torchvision import transforms

from speech_to_text import stt
import speech_recognition as sr
import whisper

Expand Down Expand Up @@ -218,7 +217,6 @@ async def listenToClient(client):

#Speech to text
if received_msg == "begin_record":
#received_msg = stt()

with sr.Microphone(sample_rate=16000) as source:
sendMessage("yes".encode("utf-8"))
Expand Down Expand Up @@ -297,12 +295,7 @@ async def listenToClient(client):
msg_audio = msg_audio.replace("{i}","")
msg_audio = msg_audio.replace("{/i}",".")
msg_audio = msg_audio.replace("~","!")
# subprocess.check_call(['tts', '--text', msg_audio, '--model_name', 'tts_models/multilingual/multi-dataset/your_tts', '--speaker_wav', 'audios/talk_13.wav', '--language_idx', 'en', '--out_path', GAME_PATH + '/game/Submods/AI_submod/audio/out.wav'])
# f = open(GAME_PATH+'/game/Submods/AI_submod/audio/out.wav', 'rb')
# AudioSegment.from_wav(f).export(GAME_PATH+'/game/Submods/AI_submod/audio/out.ogg', format='ogg')
# f.close()
# os.remove(GAME_PATH+'/game/Submods/AI_submod/audio/out.wav')


spec, audio = infer(spec_model, vocoder,msg_audio)
audio = ipd.Audio(audio, rate=22050)
audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)
Expand Down
27 changes: 27 additions & 0 deletions download_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import torch
import torch

import speech_recognition as sr
import whisper

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

###Load the emotion model#####
emotion_model = torch.load('models/enet_b2_7.pt').to(device)

###Load the speech recognizer#####
english = True
def init_stt(model="base", english=True, energy=300, pause=0.8, dynamic_energy=False):
    """Load a Whisper speech-to-text model and a configured recognizer.

    Returns a ``(recognizer, whisper_model)`` pair.  *energy*, *pause* and
    *dynamic_energy* tune the SpeechRecognition listener thresholds.
    """
    # Whisper ships English-only variants for every size except "large".
    model_name = model + ".en" if (model != "large" and english) else model
    audio_model = whisper.load_model(model_name)

    # Configure the recognizer's energy/pause behaviour up front.
    recognizer = sr.Recognizer()
    recognizer.energy_threshold = energy
    recognizer.pause_threshold = pause
    recognizer.dynamic_energy_threshold = dynamic_energy

    return recognizer, audio_model

r,audio_model = init_stt()
10 changes: 1 addition & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,4 @@ pyaudio
SpeechRecognition
git+https://github.com/openai/whisper.git
#New TTS
# git+https://github.com/NVIDIA/NeMo.git
# omegaconf
# hydra-core
# pytorch-lightning
# sentencepiece
# youtokentome
# wget
#pynini==2.1.4 #on windows, download https://www.openfst.org/twiki/pub/GRM/PyniniDownload/pynini-2.1.5.tar.gz and install it manually (need Microsoft Visual C++ 14.0 https://aka.ms/vs/17/release/vs_BuildTools.exe)
# nemo_toolkit['all']
#simpleaudio
5 changes: 5 additions & 0 deletions setup_new_tts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Install system and Python dependencies for the new NeMo-based TTS.
# NOTE(review): assumes a Debian/Ubuntu host (apt-get) run with root
# privileges, and that $BRANCH is exported before running — otherwise the
# NeMo pip spec degenerates to "...@#egg=..."; confirm both before use.
apt-get install sox libsndfile1 ffmpeg
pip install wget text-unidecode pynini==2.1.4
python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
# Fetch and run NVIDIA's helper to install pynini where the wheel is unavailable.
wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/install_pynini.sh
bash install_pynini.sh
43 changes: 0 additions & 43 deletions test_client.py

This file was deleted.

112 changes: 112 additions & 0 deletions voicing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from socket import AF_INET, SOCK_STREAM
import socket
from threading import Thread
import torch
import nest_asyncio
import IPython.display as ipd
import simpleaudio as sa
import re
import asyncio

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

nest_asyncio.apply()

from new_tts_infer import infer,get_best_ckpt_from_last_run
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import FastPitchModel

GAME_PATH = "/mnt/c/SAMUEL/ddlc-win/DDLC-1.1.1-pc"
#vocoder = HifiGanModel.from_pretrained("tts_hifigan")
vocoder = HifiGanModel.load_from_checkpoint("hifigan_ft/HifiGan/2023-01-02_14-20-41/checkpoints/HifiGan--val_loss=0.4200-epoch=167-last.ckpt")
vocoder = vocoder.eval().to(device)

new_speaker_id = 9017
duration_mins = 5
mixing = False
original_speaker_id = "ljspeech"

last_ckpt = get_best_ckpt_from_last_run("./", new_speaker_id, duration_mins, mixing, original_speaker_id)
print(last_ckpt)

spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
spec_model.eval().to(device)

# Global variables
clients = {}
addresses = {}

# Listen on localhost only; the game client (main_voicing.rpy) connects here.
HOST = '127.0.0.1'
#HOST = socket.gethostbyname(socket.gethostname())
print(HOST)
PORT = 12344
BUFSIZE = 1024
ADDRESS = (HOST, PORT)
try:
    SERVER = socket.socket(AF_INET, SOCK_STREAM)
    print("Socket creation successful")
except socket.error:
    # Fix: the original bare except printed a message and fell through to
    # SERVER.bind, which then crashed with a confusing NameError.  Report
    # and propagate the real failure instead.
    print("Socket creation Failed")
    raise
SERVER.bind(ADDRESS)

def listen():
    """Accept incoming connections forever, one handler thread per client."""
    print("Waiting for connection...")
    while True:
        conn, conn_address = SERVER.accept()
        print("%s:%s has connected." % conn_address)
        addresses[conn] = conn_address
        Thread(target=call, args=(conn,)).start()

def call(client):
    """Run the per-client listener coroutine on its own event loop.

    Each client thread needs a dedicated loop because asyncio loops are
    not shared across threads.
    """
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(listenToClient(client))
    event_loop.close()

# Clean the log.txt file: truncate it at module load so each run starts
# with an empty session log.
with open("log.txt", "w") as f:
    f.write("")

async def listenToClient(client):
    """Receive dialogue lines from the game client and voice them locally.

    Runs forever on the per-client event loop created by call().  Each
    non-empty message is stripped of Ren'Py markup, synthesized with the
    FastPitch + HifiGan models, and played; playback of the previous line
    is stopped when a new line arrives.
    """
    name = "User"
    clients[client] = name
    step = 0
    sentence_list = []
    play_obj = None  # defensive: defined before first use even if step logic changes
    while True:
        msg = client.recv(BUFSIZE).decode("utf-8")

        if msg != "":
            if step > 0:
                play_obj.stop()
                # Fix: skip duplicate lines by comparing the raw incoming
                # message with the last raw message.  The original compared
                # the *processed* previous text (msg_audio) against the raw
                # previous message, so the dedup check could never fire.
                if msg == sentence_list[-1]:
                    continue
            # Strip Ren'Py text tags and normalize punctuation for the TTS.
            msg_audio = msg.replace("\n", " ")
            msg_audio = msg_audio.replace("{i}", "")
            msg_audio = msg_audio.replace("{/i}", ".")
            msg_audio = msg_audio.replace("~", "!")
            # Remove any remaining {...} tags.
            # NOTE(review): the original comment also mentioned [] but the
            # regex never handled it — [var] is Ren'Py interpolation, so it
            # is deliberately left untouched here; confirm with the sender.
            msg_audio = re.sub(r'\{.*?\}', '', msg_audio)

            print(repr(msg_audio))
            sentence_list.append(msg)

            spec, audio = infer(spec_model, vocoder, msg_audio)
            audio = ipd.Audio(audio, rate=22050)
            play_obj = sa.play_buffer(audio.data, 1, 2, 22050)
            step += 1

def sendMessage(msg, name=""):
    """Broadcast *msg* (bytes), prefixed with *name*, to every connected
    client in the chat room."""
    # Encode the name prefix once instead of per recipient.
    prefix = bytes(name, "utf8")
    for sock in clients:
        sock.send(prefix + msg)

if __name__ == "__main__":
    # Allow up to 10 queued connections, then accept clients on a
    # dedicated thread until it exits, and release the socket afterwards.
    SERVER.listen(10)
    ACCEPT_THREAD = Thread(target=listen)
    ACCEPT_THREAD.start()
    ACCEPT_THREAD.join()
    SERVER.close()

0 comments on commit 90038b2

Please sign in to comment.