From 90038b23a6d6c9fd19ed6fd2d10d90a18f326ca7 Mon Sep 17 00:00:00 2001
From: Rubiksman78
Date: Mon, 2 Jan 2023 15:45:06 +0100
Subject: [PATCH] voicing

---
 AI_submod/main_voicing.rpy      |  50 ++++++++++++++
 combined_server.py              |   6 +-
 combined_server_for_the_bold.py |   9 +--
 download_models.py              |  27 ++++++++
 requirements.txt                |  10 +--
 setup_new_tts.sh                |   5 ++
 test_client.py                  |  43 ------------
 voicing.py                      | 112 ++++++++++++++++++++++++++++++++
 8 files changed, 198 insertions(+), 64 deletions(-)
 create mode 100644 AI_submod/main_voicing.rpy
 create mode 100644 download_models.py
 create mode 100644 setup_new_tts.sh
 delete mode 100644 test_client.py
 create mode 100644 voicing.py

diff --git a/AI_submod/main_voicing.rpy b/AI_submod/main_voicing.rpy
new file mode 100644
index 0000000..bbd52d8
--- /dev/null
+++ b/AI_submod/main_voicing.rpy
@@ -0,0 +1,50 @@
+init 5 python in mas_voice:
+    import store
+    import os
+    import sys
+    import re
+    import time
+    import Queue
+    import select
+    import socket
+    import subprocess
+    from socket import AF_INET, SOCK_STREAM
+    from threading import Thread, current_thread
+    from time import sleep
+
+    renpy.music.register_channel("mvoice", mixer="sfx", loop=False)
+
+    def receiveMessage():
+        msg = client_socket.recv(BUFSIZ).decode("utf8")
+        return msg
+
+    def send_simple(prefix):
+        client_socket.send(prefix.encode("utf8"))
+
+    def audio_file_exists(filename):
+        return os.path.isfile(filename)
+
+    HOST = '127.0.0.1'
+    PORT = 12344
+
+    BUFSIZ = 1024
+    ADDR = (HOST, PORT)
+    client_socket = socket.socket(AF_INET, SOCK_STREAM)
+
+    # If the voicing server is not running, voicing stays disabled for this session
+    try:
+        client_socket.connect(ADDR)
+        connected = True
+    except:
+        connected = False
+
+    # Wrap renpy.say so every displayed line is also sent to the TTS server
+    def get_wrapped_say(func):
+        def new_say(who, what, interact=True, *args, **kwargs):
+            speaktext = renpy.substitute(what)
+            send_simple(speaktext)
+            func(who, what, interact=interact, *args, **kwargs)
+        return new_say
+
+    if connected:
+        renpy.say = get_wrapped_say(renpy.say)
diff --git a/combined_server.py b/combined_server.py
index f45de88..f04a028 100644
--- a/combined_server.py
+++ b/combined_server.py
@@ -19,7 +19,6 @@
 from PIL import Image
 from torchvision import transforms
 
-#from speech_to_text import stt
 import speech_recognition as sr
 import whisper
 
@@ -41,7 +40,7 @@
                     help='use chatbot')
 parser.add_argument('--use_emotion_detection', type=bool, default=True,
                     help='use emotion detection')
-parser.add_argument('--use_audio', type=bool, default=True,
+parser.add_argument('--use_audio', type=bool, default=False,
                     help='use audio')
 parser.add_argument('--emotion_time', type=int, default=10,
                     help='time between camera captures')
@@ -177,7 +176,7 @@ def call(client):
     loop.close()
 
 #Launch the game
-subprocess.Popen(GAME_PATH+'\DDLC.exe')
+#subprocess.Popen(GAME_PATH+'\DDLC.exe')
 
 async def listenToClient(client):
     """ Get client username """
@@ -200,7 +199,6 @@ async def listenToClient(client):
 
         #Speech to text
         if received_msg == "begin_record":
-            #received_msg = stt()
             with sr.Microphone(sample_rate=16000) as source:
                 sendMessage("yes".encode("utf-8"))
diff --git a/combined_server_for_the_bold.py b/combined_server_for_the_bold.py
index 4bfa476..06b4cb3 100644
--- a/combined_server_for_the_bold.py
+++ b/combined_server_for_the_bold.py
@@ -19,7 +19,6 @@
 from PIL import Image
 from torchvision import transforms
 
-from speech_to_text import stt
 import speech_recognition as sr
 import whisper
 
@@ -218,7 +217,6 @@ async def listenToClient(client):
 
         #Speech to text
         if received_msg == "begin_record":
-            #received_msg = stt()
             with sr.Microphone(sample_rate=16000) as source:
                 sendMessage("yes".encode("utf-8"))
@@ -297,12 +295,7 @@ async def listenToClient(client):
             msg_audio = msg_audio.replace("{i}","")
             msg_audio = msg_audio.replace("{/i}",".")
             msg_audio = msg_audio.replace("~","!")
-            # subprocess.check_call(['tts', '--text', msg_audio, '--model_name', 'tts_models/multilingual/multi-dataset/your_tts', '--speaker_wav', 'audios/talk_13.wav', '--language_idx', 'en', '--out_path', GAME_PATH + '/game/Submods/AI_submod/audio/out.wav'])
-            # f = open(GAME_PATH+'/game/Submods/AI_submod/audio/out.wav', 'rb')
-            # AudioSegment.from_wav(f).export(GAME_PATH+'/game/Submods/AI_submod/audio/out.ogg', format='ogg')
-            # f.close()
-            # os.remove(GAME_PATH+'/game/Submods/AI_submod/audio/out.wav')
-
+
             spec, audio = infer(spec_model, vocoder,msg_audio)
             audio = ipd.Audio(audio, rate=22050)
             audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)
diff --git a/download_models.py b/download_models.py
new file mode 100644
index 0000000..961b80a
--- /dev/null
+++ b/download_models.py
@@ -0,0 +1,27 @@
+import torch
+
+import speech_recognition as sr
+import whisper
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+### Load the emotion model ###
+emotion_model = torch.load('models/enet_b2_7.pt').to(device)
+
+### Load the speech recognizer ###
+def init_stt(model="base", english=True, energy=300, pause=0.8, dynamic_energy=False):
+    # Whisper ships dedicated English models for every size except "large"
+    if model != "large" and english:
+        model = model + ".en"
+    audio_model = whisper.load_model(model)
+
+    # Load the speech recognizer and set the initial energy and pause thresholds
+    r = sr.Recognizer()
+    r.energy_threshold = energy
+    r.pause_threshold = pause
+    r.dynamic_energy_threshold = dynamic_energy
+
+    return r, audio_model
+
+r, audio_model = init_stt()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 0266223..c8ee9d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,12 +19,4 @@
 pyaudio
 SpeechRecognition
 git+https://github.com/openai/whisper.git
 #New TTS
-# git+https://github.com/NVIDIA/NeMo.git
-# omegaconf
-# hydra-core
-# pytorch-lightning
-# sentencepiece
-# youtokentome
-# wget
-#pynini==2.1.4 #on windows, download https://www.openfst.org/twiki/pub/GRM/PyniniDownload/pynini-2.1.5.tar.gz and install it manually (need Microsoft Visual C++ 14.0 https://aka.ms/vs/17/release/vs_BuildTools.exe)
-# nemo_toolkit['all']
+#simpleaudio
diff --git a/setup_new_tts.sh b/setup_new_tts.sh
new file mode 100644
index 0000000..0fb2e82
--- /dev/null
+++ b/setup_new_tts.sh
@@ -0,0 +1,5 @@
+# $BRANCH must point at the desired NeMo git ref (e.g. main) before running
+apt-get install sox libsndfile1 ffmpeg
+pip install wget text-unidecode pynini==2.1.4
+python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
+wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/install_pynini.sh
+bash install_pynini.sh
\ No newline at end of file
diff --git a/test_client.py b/test_client.py
deleted file mode 100644
index 793afd7..0000000
--- a/test_client.py
+++ /dev/null
@@ -1,43 +0,0 @@
-emotions = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
-from socket import AF_INET, socket, SOCK_STREAM
-from threading import Thread
-
-def receiveMessage():
-    msg = client_socket.recv(BUFSIZ).decode("utf8")
-    return msg
-
-def sendMessage(prefix,step):
-    my_msg = input(prefix)
-    client_socket.send(bytes(my_msg + "/g" + step).encode("utf8"))
-    return my_msg
-
-HOST = "127.0.0.1"
-PORT = 12346
-
-BUFSIZ = 1024
-ADDR = (HOST, PORT)
-client_socket = socket(AF_INET, SOCK_STREAM)
-client_socket.connect(ADDR)
-
-while True:
-    received_emotion = receiveMessage()
-
-    if received_emotion == "angry":
-        print("You are angry")
-    elif received_emotion == "disgusted":
-        print("You are disgusted")
-    elif received_emotion == "fearful":
-        print("You are fearful")
-    elif received_emotion == "happy":
-        print("You are happy")
-    elif received_emotion == "neutral":
-        print("You are neutral")
-    elif received_emotion == "sad":
-        print("You are sad")
-    elif received_emotion == "surprised":
-        print("You are surprised")
-    elif received_emotion == "confused":
-        print("You are confused")
-    else:
-        print("I can't detect your emotion")
-
\ No newline at end of file
diff --git a/voicing.py b/voicing.py
new file mode 100644
index 0000000..ed8d8a0
--- /dev/null
+++ b/voicing.py
@@ -0,0 +1,112 @@
+from socket import AF_INET, SOCK_STREAM
+import socket
+from threading import Thread
+import torch
+import nest_asyncio
+import IPython.display as ipd
+import simpleaudio as sa
+import re
+import asyncio
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Allow nested event loops (one per client thread)
+nest_asyncio.apply()
+
+from new_tts_infer import infer, get_best_ckpt_from_last_run
+from nemo.collections.tts.models import HifiGanModel
+from nemo.collections.tts.models import FastPitchModel
+
+GAME_PATH = "/mnt/c/SAMUEL/ddlc-win/DDLC-1.1.1-pc"
+
+# Fine-tuned HifiGan vocoder
+#vocoder = HifiGanModel.from_pretrained("tts_hifigan")
+vocoder = HifiGanModel.load_from_checkpoint("hifigan_ft/HifiGan/2023-01-02_14-20-41/checkpoints/HifiGan--val_loss=0.4200-epoch=167-last.ckpt")
+vocoder = vocoder.eval().to(device)
+
+new_speaker_id = 9017
+duration_mins = 5
+mixing = False
+original_speaker_id = "ljspeech"
+
+# Fine-tuned FastPitch spectrogram model from the last training run
+last_ckpt = get_best_ckpt_from_last_run("./", new_speaker_id, duration_mins, mixing, original_speaker_id)
+print(last_ckpt)
+
+spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
+spec_model.eval().to(device)
+
+# Global variables
+clients = {}
+addresses = {}
+
+HOST = '127.0.0.1'
+#HOST = socket.gethostbyname(socket.gethostname())
+print(HOST)
+PORT = 12344
+BUFSIZE = 1024
+ADDRESS = (HOST, PORT)
+try:
+    SERVER = socket.socket(AF_INET, SOCK_STREAM)
+    print("Socket creation successful")
+except socket.error:
+    print("Socket creation failed")
+SERVER.bind(ADDRESS)
+
+def listen():
+    """ Wait for incoming connections """
+    print("Waiting for connection...")
+    while True:
+        client, client_address = SERVER.accept()
+        print("%s:%s has connected." % client_address)
+        addresses[client] = client_address
+        Thread(target=call, args=(client,)).start()
+
+def call(client):
+    # Give each client thread its own event loop
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    loop.run_until_complete(listenToClient(client))
+    loop.close()
+
+#Clean the log.txt file
+with open("log.txt", "w") as f:
+    f.write("")
+
+async def listenToClient(client):
+    """ Receive dialogue lines from the game and voice them """
+    name = "User"
+    clients[client] = name
+    step = 0
+    sentence_list = []
+    while True:
+        msg = client.recv(BUFSIZE).decode("utf-8")
+
+        # An empty read means the client disconnected
+        if msg == "":
+            break
+
+        if step > 0:
+            # Interrupt the line currently playing and skip duplicates
+            play_obj.stop()
+            if msg == sentence_list[-1]:
+                continue
+
+        # Strip Ren'Py markup before synthesis
+        msg_audio = msg.replace("\n", " ")
+        msg_audio = msg_audio.replace("{i}", "")
+        msg_audio = msg_audio.replace("{/i}", ".")
+        msg_audio = msg_audio.replace("~", "!")
+        # Remove characters in {} and []
+        msg_audio = re.sub(r'\{.*?\}', '', msg_audio)
+        msg_audio = re.sub(r'\[.*?\]', '', msg_audio)
+
+        print(repr(msg_audio))
+        sentence_list.append(msg)
+
+        spec, audio = infer(spec_model, vocoder, msg_audio)
+        audio = ipd.Audio(audio, rate=22050)
+        play_obj = sa.play_buffer(audio.data, 1, 2, 22050)
+        step += 1
+
+def sendMessage(msg, name=""):
+    """ Send a message to every connected client """
+    for client in clients:
+        client.send(bytes(name, "utf8") + msg)
+
+if __name__ == "__main__":
+    SERVER.listen(10)
+    ACCEPT_THREAD = Thread(target=listen)
+    ACCEPT_THREAD.start()
+    ACCEPT_THREAD.join()
+    SERVER.close()
\ No newline at end of file
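
Note: for quick manual testing of voicing.py outside the game, a client only needs to open a TCP connection to 127.0.0.1:12344 and send raw UTF-8 text, exactly as AI_submod/main_voicing.rpy does. A minimal sketch, not part of the patch (the file name and sample sentence are made up):

    # test_voicing_client.py -- hypothetical helper mirroring main_voicing.rpy.
    # The protocol is unframed UTF-8 text over TCP; the server reads up to
    # BUFSIZE (1024) bytes per recv() and synthesizes whatever arrives.
    import socket

    ADDR = ("127.0.0.1", 12344)

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect(ADDR)
        s.send("Hello! This line should be synthesized and played.".encode("utf-8"))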