talk_net.py

import argparse
import glob
import math
import os
import pdb
import pickle
import subprocess
import sys
import time
import warnings
from shutil import rmtree

import cv2
import numpy
import python_speech_features
import torch
import tqdm
from scenedetect.detectors import ContentDetector
from scenedetect.frame_timecode import FrameTimecode
from scenedetect.scene_manager import SceneManager
from scenedetect.stats_manager import StatsManager
from scenedetect.video_manager import VideoManager
from scipy import signal
from scipy.interpolate import interp1d
from scipy.io import wavfile
from sklearn.metrics import accuracy_score, f1_score

from model.faceDetector.s3fd import S3FD
from talkNet import talkNet

warnings.filterwarnings("ignore")

parser = argparse.ArgumentParser(description="TalkNet Demo or Columnbia ASD Evaluation")

parser.add_argument("--videoName", type=str, default="001", help="Demo video name")
parser.add_argument(
    "--videoFolder", type=str, default="demo", help="Path for inputs, tmps and outputs"
)
parser.add_argument(
    "--pretrainModel",
    type=str,
    default="pretrain_TalkSet.model",
    help="Path for the pretrained TalkNet model",
)

parser.add_argument(
    "--nDataLoaderThread", type=int, default=10, help="Number of workers"
)
parser.add_argument(
    "--facedetScale",
    type=float,
    default=0.25,
    help="Scale factor for face detection, the frames will be scale to 0.25 orig",
)
parser.add_argument(
    "--minTrack", type=int, default=10, help="Number of min frames for each shot"
)
parser.add_argument(
    "--numFailedDet",
    type=int,
    default=10,
    help="Number of missed detections allowed before tracking is stopped",
)
parser.add_argument(
    "--minFaceSize", type=int, default=1, help="Minimum face size in pixels"
)
parser.add_argument("--cropScale", type=float, default=0.40, help="Scale bounding box")

parser.add_argument("--start", type=int, default=0, help="The start time of the video")
parser.add_argument(
    "--duration",
    type=int,
    default=0,
    help="The duration of the video, when set as 0, will extract the whole video",
)

parser.add_argument(
    "--evalCol",
    dest="evalCol",
    action="store_true",
    help="Evaluate on Columnbia dataset",
)
parser.add_argument(
    "--colSavePath",
    type=str,
    default="/data08/col",
    help="Path for inputs, tmps and outputs",
)

args = parser.parse_args()

if os.path.isfile(args.pretrainModel) == False:  # Download the pretrained model
    Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea"
    cmd = "gdown --id %s -O %s" % (Link, args.pretrainModel)
    subprocess.call(cmd, shell=True, stdout=None)

if args.evalCol == True:
    # The process is: 1. download video and labels(I have modified the format of labels to make it easiler for using)
    # 	              2. extract audio, extract video frames
    #                 3. scend detection, face detection and face tracking
    #                 4. active speaker detection for the detected face clips
    #                 5. use iou to find the identity of each face clips, compute the F1 results
    # The step 1 to 3 will take some time (That is one-time process). It depends on your cpu and gpu speed. For reference, I used 1.5 hour
    # The step 4 and 5 need less than 10 minutes
    # Need about 20G space finally
    # ```
    args.videoName = "col"
    args.videoFolder = args.colSavePath
    args.savePath = os.path.join(args.videoFolder, args.videoName)
    args.videoPath = os.path.join(args.videoFolder, args.videoName + ".mp4")
    args.duration = 0
    if os.path.isfile(args.videoPath) == False:  # Download video
        link = "https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s"
        cmd = "youtube-dl -f best -o %s '%s'" % (args.videoPath, link)
        output = subprocess.call(cmd, shell=True, stdout=None)
    if os.path.isdir(args.videoFolder + "/col_labels") == False:  # Download label
        link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv"
        cmd = "gdown --id %s -O %s" % (link, args.videoFolder + "/col_labels.tar.gz")
        subprocess.call(cmd, shell=True, stdout=None)
        cmd = "tar -xzvf %s -C %s" % (
            args.videoFolder + "/col_labels.tar.gz",
            args.videoFolder,
        )
        subprocess.call(cmd, shell=True, stdout=None)
        os.remove(args.videoFolder + "/col_labels.tar.gz")
else:
    args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + ".*"))[0]
    args.savePath = os.path.join(args.videoFolder, args.videoName)


def scene_detect(args):
    # CPU: Scene detection, output is the list of each shot's time duration
    videoManager = VideoManager([args.videoFilePath])
    statsManager = StatsManager()
    sceneManager = SceneManager(statsManager)
    sceneManager.add_detector(ContentDetector())
    baseTimecode = videoManager.get_base_timecode()
    videoManager.set_downscale_factor()
    videoManager.start()
    sceneManager.detect_scenes(frame_source=videoManager)
    sceneList = sceneManager.get_scene_list(baseTimecode)
    savePath = os.path.join(args.pyworkPath, "scene.pckl")
    if sceneList == []:
        sceneList = [
            (videoManager.get_base_timecode(), videoManager.get_current_timecode())
        ]
    with open(savePath, "wb") as fil:
        pickle.dump(sceneList, fil)
        sys.stderr.write(
            "%s - scenes detected %d\n" % (args.videoFilePath, len(sceneList))
        )
    return sceneList


def inference_video(args):
    # GPU: Face detection, output is the list contains the face location and score in this frame
    DET = S3FD(device="cuda")
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
    flist.sort()
    dets = []
    for fidx, fname in enumerate(flist):
        image = cv2.imread(fname)
        imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[args.facedetScale])
        dets.append([])
        for bbox in bboxes:
            dets[-1].append(
                {"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]}
            )  # dets has the frames info, bbox info, conf info
        sys.stderr.write(
            "%s-%05d; %d dets\r" % (args.videoFilePath, fidx, len(dets[-1]))
        )
    savePath = os.path.join(args.pyworkPath, "faces.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(dets, fil)
    return dets


def bb_intersection_over_union(boxA, boxB, evalCol=False):
    # CPU: IOU Function to calculate overlap between two image
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    interArea = max(0, xB - xA) * max(0, yB - yA)
    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    if evalCol == True:
        iou = interArea / float(boxAArea)
    else:
        iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou


def track_shot(args, sceneFaces):
    # CPU: Face tracking
    iouThres = 0.5  # Minimum IOU between consecutive face detections
    tracks = []
    while True:
        track = []
        for frameFaces in sceneFaces:
            for face in frameFaces:
                if track == []:
                    track.append(face)
                    frameFaces.remove(face)
                elif face["frame"] - track[-1]["frame"] <= args.numFailedDet:
                    iou = bb_intersection_over_union(face["bbox"], track[-1]["bbox"])
                    if iou > iouThres:
                        track.append(face)
                        frameFaces.remove(face)
                        continue
                else:
                    break
        if track == []:
            break
        elif len(track) > args.minTrack:
            frameNum = numpy.array([f["frame"] for f in track])
            bboxes = numpy.array([numpy.array(f["bbox"]) for f in track])
            frameI = numpy.arange(frameNum[0], frameNum[-1] + 1)
            bboxesI = []
            for ij in range(0, 4):
                interpfn = interp1d(frameNum, bboxes[:, ij])
                bboxesI.append(interpfn(frameI))
            bboxesI = numpy.stack(bboxesI, axis=1)
            if (
                max(
                    numpy.mean(bboxesI[:, 2] - bboxesI[:, 0]),
                    numpy.mean(bboxesI[:, 3] - bboxesI[:, 1]),
                )
                > args.minFaceSize
            ):
                tracks.append({"frame": frameI, "bbox": bboxesI})
    return tracks


def crop_video(args, track, cropFile):
    # CPU: crop the face clips
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))  # Read the frames
    flist.sort()
    vOut = cv2.VideoWriter(
        cropFile + "t.avi", cv2.VideoWriter_fourcc(*"XVID"), 25, (224, 224)
    )  # Write video
    dets = {"x": [], "y": [], "s": []}
    for det in track["bbox"]:  # Read the tracks
        dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)
        dets["y"].append((det[1] + det[3]) / 2)  # crop center x
        dets["x"].append((det[0] + det[2]) / 2)  # crop center y
    dets["s"] = signal.medfilt(dets["s"], kernel_size=13)  # Smooth detections
    dets["x"] = signal.medfilt(dets["x"], kernel_size=13)
    dets["y"] = signal.medfilt(dets["y"], kernel_size=13)
    for fidx, frame in enumerate(track["frame"]):
        cs = args.cropScale
        bs = dets["s"][fidx]  # Detection box size
        bsi = int(bs * (1 + 2 * cs))  # Pad videos by this amount
        image = cv2.imread(flist[frame])
        frame = numpy.pad(
            image,
            ((bsi, bsi), (bsi, bsi), (0, 0)),
            "constant",
            constant_values=(110, 110),
        )
        my = dets["y"][fidx] + bsi  # BBox center Y
        mx = dets["x"][fidx] + bsi  # BBox center X
        face = frame[
            int(my - bs) : int(my + bs * (1 + 2 * cs)),
            int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs)),
        ]
        vOut.write(cv2.resize(face, (224, 224)))
    audioTmp = cropFile + ".wav"
    audioStart = (track["frame"][0]) / 25
    audioEnd = (track["frame"][-1] + 1) / 25
    vOut.release()
    command = (
        "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic"
        % (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)
    )
    output = subprocess.call(command, shell=True, stdout=None)  # Crop audio file
    _, audio = wavfile.read(audioTmp)
    command = (
        "ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic"
        % (cropFile, audioTmp, args.nDataLoaderThread, cropFile)
    )  # Combine audio and video file
    output = subprocess.call(command, shell=True, stdout=None)
    os.remove(cropFile + "t.avi")
    return {"track": track, "proc_track": dets}


def extract_MFCC(file, outPath):
    # CPU: extract mfcc
    sr, audio = wavfile.read(file)
    mfcc = python_speech_features.mfcc(audio, sr)  # (N_frames, 13)   [1s = 100 frames]
    featuresPath = os.path.join(outPath, file.split("/")[-1].replace(".wav", ".npy"))
    numpy.save(featuresPath, mfcc)


def evaluate_network(files, args):
    # GPU: active speaker detection by pretrained TalkNet
    s = talkNet()
    s.loadParameters(args.pretrainModel)
    sys.stderr.write("Model %s loaded from previous state! \r\n" % args.pretrainModel)
    s.eval()
    allScores = []
    # durationSet = {1,2,4,6} # To make the result more reliable
    # durationSet = {1,1,1,2,2,2,3,3,4,5,6} # Use this line can get more reliable result
    durationSet = {
        1,
    }  # Use this line can get more reliable result
    for file in tqdm.tqdm(files, total=len(files)):
        fileName = os.path.splitext(file.split("/")[-1])[0]  # Load audio and video
        _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + ".wav"))
        audioFeature = python_speech_features.mfcc(
            audio, 16000, numcep=13, winlen=0.025, winstep=0.010
        )
        video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + ".avi"))
        videoFeature = []
        while video.isOpened():
            ret, frames = video.read()
            if ret == True:
                face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
                face = cv2.resize(face, (224, 224))
                face = face[
                    int(112 - (112 / 2)) : int(112 + (112 / 2)),
                    int(112 - (112 / 2)) : int(112 + (112 / 2)),
                ]
                videoFeature.append(face)
            else:
                break
        video.release()
        videoFeature = numpy.array(videoFeature)
        length = min(
            (audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100,
            videoFeature.shape[0] / 25,
        )
        audioFeature = audioFeature[: int(round(length * 100)), :]
        videoFeature = videoFeature[: int(round(length * 25)), :, :]
        allScore = []  # Evaluation use TalkNet
        for duration in durationSet:
            batchSize = int(math.ceil(length / duration))
            scores = []
            with torch.no_grad():
                for i in range(batchSize):
                    # inputA = (
                    #     torch.FloatTensor(
                    #         audioFeature[
                    #             i * duration * 100 : (i + 1) * duration * 100, :
                    #         ]
                    #     )
                    #     .unsqueeze(0)
                    #     .cuda()
                    # )
                    import numpy as np

                    zero_array = np.zeros_like(
                        audioFeature[i * duration * 100 : (i + 1) * duration * 100, :]
                    )
                    inputA = torch.FloatTensor(zero_array).unsqueeze(0).cuda()
                    inputV = (
                        torch.FloatTensor(
                            videoFeature[
                                i * duration * 25 : (i + 1) * duration * 25, :, :
                            ]
                        )
                        .unsqueeze(0)
                        .cuda()
                    )
                    embedA = s.model.forward_audio_frontend(inputA)
                    embedV = s.model.forward_visual_frontend(inputV)
                    embedA, embedV = s.model.forward_cross_attention(embedA, embedV)
                    out = s.model.forward_audio_visual_backend(embedA, embedV)
                    score = s.lossAV.forward(out, labels=None)
                    scores.extend(score)
            allScore.append(scores)
        allScore = numpy.round((numpy.mean(numpy.array(allScore), axis=0)), 1).astype(
            float
        )
        allScores.append(allScore)
    return allScores


def visualization(tracks, scores, args):
    # CPU: visulize the result for video format
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
    flist.sort()
    faces = [[] for i in range(len(flist))]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            s = score[
                max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)
            ]  # average smoothing
            s = numpy.mean(s)
            faces[frame].append(
                {
                    "track": tidx,
                    "score": float(s),
                    "s": track["proc_track"]["s"][fidx],
                    "x": track["proc_track"]["x"][fidx],
                    "y": track["proc_track"]["y"][fidx],
                }
            )
    firstImage = cv2.imread(flist[0])
    fw = firstImage.shape[1]
    fh = firstImage.shape[0]
    vOut = cv2.VideoWriter(
        os.path.join(args.pyaviPath, "video_only.avi"),
        cv2.VideoWriter_fourcc(*"XVID"),
        25,
        (fw, fh),
    )
    colorDict = {0: 0, 1: 255}
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        image = cv2.imread(fname)
        score_max = -100
        score_max_index = -1
        for index, face in enumerate(faces[fidx]):
            # clr = colorDict[int((face["score"] >= 0))]
            if face["score"] > score_max:
                score_max_index = index
                score_max = face["score"]
        clrs = []
        for i in range(len(faces[fidx])):
            if i == score_max_index:
                clrs.append(255)
            clrs.append(0)
        for face, clr in zip(faces[fidx], clrs):
            # clr = colorDict[int((face["score"] >= 0))]
            txt = round(face["score"], 1)
            cv2.rectangle(
                image,
                (int(face["x"] - face["s"]), int(face["y"] - face["s"])),
                (int(face["x"] + face["s"]), int(face["y"] + face["s"])),
                (0, clr, 255 - clr),
                10,
            )
            cv2.putText(
                image,
                "%s" % (txt),
                (int(face["x"] - face["s"]), int(face["y"] - face["s"])),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.5,
                (0, clr, 255 - clr),
                5,
            )
        vOut.write(image)
    vOut.release()
    command = (
        "ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic"
        % (
            os.path.join(args.pyaviPath, "video_only.avi"),
            os.path.join(args.pyaviPath, "audio.wav"),
            args.nDataLoaderThread,
            os.path.join(args.pyaviPath, "video_out.avi"),
        )
    )
    output = subprocess.call(command, shell=True, stdout=None)


def evaluate_col_ASD(tracks, scores, args):
    txtPath = args.videoFolder + "/col_labels/fusion/*.txt"  # Load labels
    predictionSet = {}
    for name in {"long", "bell", "boll", "lieb", "sick", "abbas"}:
        predictionSet[name] = [[], []]
    dictGT = {}
    txtFiles = glob.glob("%s" % txtPath)
    for file in txtFiles:
        lines = open(file).read().splitlines()
        idName = file.split("/")[-1][:-4]
        for line in lines:
            data = line.split("\t")
            frame = int(int(data[0]) / 29.97 * 25)
            x1 = int(data[1])
            y1 = int(data[2])
            x2 = int(data[1]) + int(data[3])
            y2 = int(data[2]) + int(data[3])
            gt = int(data[4])
            if frame in dictGT:
                dictGT[frame].append([x1, y1, x2, y2, gt, idName])
            else:
                dictGT[frame] = [[x1, y1, x2, y2, gt, idName]]
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))  # Load files
    flist.sort()
    faces = [[] for i in range(len(flist))]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            s = numpy.mean(
                score[max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)]
            )  # average smoothing
            faces[frame].append(
                {
                    "track": tidx,
                    "score": float(s),
                    "s": track["proc_track"]["s"][fidx],
                    "x": track["proc_track"]["x"][fidx],
                    "y": track["proc_track"]["y"][fidx],
                }
            )
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        if fidx in dictGT:  # This frame has label
            for gtThisFrame in dictGT[fidx]:  # What this label is ?
                faceGT = gtThisFrame[0:4]
                labelGT = gtThisFrame[4]
                idGT = gtThisFrame[5]
                ious = []
                for face in faces[fidx]:  # Find the right face in my result
                    faceLocation = [
                        int(face["x"] - face["s"]),
                        int(face["y"] - face["s"]),
                        int(face["x"] + face["s"]),
                        int(face["y"] + face["s"]),
                    ]
                    faceLocation_new = [
                        int(face["x"] - face["s"]) // 2,
                        int(face["y"] - face["s"]) // 2,
                        int(face["x"] + face["s"]) // 2,
                        int(face["y"] + face["s"]) // 2,
                    ]
                    iou = bb_intersection_over_union(
                        faceLocation_new, faceGT, evalCol=True
                    )
                    if iou > 0.5:
                        ious.append([iou, round(face["score"], 2)])
                if len(ious) > 0:  # Find my result
                    ious.sort()
                    labelPredict = ious[-1][1]
                else:
                    labelPredict = 0
                x1 = faceGT[0]
                y1 = faceGT[1]
                width = faceGT[2] - faceGT[0]
                predictionSet[idGT][0].append(labelPredict)
                predictionSet[idGT][1].append(labelGT)
    names = ["long", "bell", "boll", "lieb", "sick", "abbas"]  # Evaluate
    names.sort()
    F1s = 0
    for i in names:
        scores = numpy.array(predictionSet[i][0])
        labels = numpy.array(predictionSet[i][1])
        scores = numpy.int64(scores > 0)
        F1 = f1_score(labels, scores)
        ACC = accuracy_score(labels, scores)
        if i != "abbas":
            F1s += F1
            print("%s, ACC:%.2f, F1:%.2f" % (i, 100 * ACC, 100 * F1))
    print("Average F1:%.2f" % (100 * (F1s / 5)))


# Main function
def main():
    # This preprocesstion is modified based on this [repository](https://github.com/joonson/syncnet_python).
    # ```
    # .
    # ├── pyavi
    # │   ├── audio.wav (Audio from input video)
    # │   ├── video.avi (Copy of the input video)
    # │   ├── video_only.avi (Output video without audio)
    # │   └── video_out.avi  (Output video with audio)
    # ├── pycrop (The detected face videos and audios)
    # │   ├── 000000.avi
    # │   ├── 000000.wav
    # │   ├── 000001.avi
    # │   ├── 000001.wav
    # │   └── ...
    # ├── pyframes (All the video frames in this video)
    # │   ├── 000001.jpg
    # │   ├── 000002.jpg
    # │   └── ...
    # └── pywork
    #     ├── faces.pckl (face detection result)
    #     ├── scene.pckl (scene detection result)
    #     ├── scores.pckl (ASD result)
    #     └── tracks.pckl (face tracking result)
    # ```

    # Initialization
    args.pyaviPath = os.path.join(args.savePath, "pyavi")
    args.pyframesPath = os.path.join(args.savePath, "pyframes")
    args.pyworkPath = os.path.join(args.savePath, "pywork")
    args.pycropPath = os.path.join(args.savePath, "pycrop")
    if os.path.exists(args.savePath):
        rmtree(args.savePath)
    os.makedirs(
        args.pyaviPath, exist_ok=True
    )  # The path for the input video, input audio, output video
    os.makedirs(args.pyframesPath, exist_ok=True)  # Save all the video frames
    os.makedirs(
        args.pyworkPath, exist_ok=True
    )  # Save the results in this process by the pckl method
    os.makedirs(
        args.pycropPath, exist_ok=True
    )  # Save the detected face clips (audio+video) in this process

    # Extract video
    args.videoFilePath = os.path.join(args.pyaviPath, "video.avi")
    # If duration did not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration'
    if args.duration == 0:
        command = (
            "ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic"
            % (args.videoPath, args.nDataLoaderThread, args.videoFilePath)
        )
    else:
        command = (
            "ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic"
            % (
                args.videoPath,
                args.nDataLoaderThread,
                args.start,
                args.start + args.duration,
                args.videoFilePath,
            )
        )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the video and save in %s \r\n" % (args.videoFilePath)
    )

    # Extract audio
    args.audioFilePath = os.path.join(args.pyaviPath, "audio.wav")
    command = (
        "ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic"
        % (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)
    )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the audio and save in %s \r\n" % (args.audioFilePath)
    )

    # Extract the video frames
    command = "ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % (
        args.videoFilePath,
        args.nDataLoaderThread,
        os.path.join(args.pyframesPath, "%06d.jpg"),
    )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the frames and save in %s \r\n" % (args.pyframesPath)
    )

    # Scene detection for the video frames
    scene = scene_detect(args)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Scene detection and save in %s \r\n" % (args.pyworkPath)
    )

    # Face detection for the video frames
    faces = inference_video(args)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face detection and save in %s \r\n" % (args.pyworkPath)
    )

    # Face tracking
    allTracks, vidTracks = [], []
    for shot in scene:
        if (
            shot[1].frame_num - shot[0].frame_num >= args.minTrack
        ):  # Discard the shot frames less than minTrack frames
            allTracks.extend(
                track_shot(args, faces[shot[0].frame_num : shot[1].frame_num])
            )  # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face track and detected %d tracks \r\n" % len(allTracks)
    )

    # Face clips cropping
    for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)):
        vidTracks.append(
            crop_video(args, track, os.path.join(args.pycropPath, "%05d" % ii))
        )
    savePath = os.path.join(args.pyworkPath, "tracks.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(vidTracks, fil)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face Crop and saved in %s tracks \r\n" % args.pycropPath
    )
    fil = open(savePath, "rb")
    vidTracks = pickle.load(fil)

    # Active Speaker Detection by TalkNet
    files = glob.glob("%s/*.avi" % args.pycropPath)
    files.sort()
    scores = evaluate_network(files, args)
    savePath = os.path.join(args.pyworkPath, "scores.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(scores, fil)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Scores extracted and saved in %s \r\n" % args.pyworkPath
    )

    if args.evalCol == True:
        evaluate_col_ASD(
            vidTracks, scores, args
        )  # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want
        quit()
    else:
        # Visualization, save the result as the new video
        visualization(vidTracks, scores, args)


if __name__ == "__main__":
    main()