From b4a32d6014fa24591c9a9c2688d975a6fd357b7e Mon Sep 17 00:00:00 2001
From: Robert <contact@rmusser.net>
Date: Sat, 4 May 2024 13:59:57 -0700
Subject: [PATCH] Making progress...

Added sanity checks for file existence, file naming (remove characters that are illegal in Windows filenames and normalize to ASCII), and CUDA existence.
---
 .gitignore | Bin 6814 -> 6834 bytes
 diarize.py |  33 +++++++++------------------------
 2 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6a00dd225faac9e0213cd69154f2005c5be28ee8..2f9281739171bfec1ce774a83882efcc0ad332e2 100644
GIT binary patch
delta 32
mcmbPdy2+I7{r<f`w2^6!lmM5OUU^~}mwr)dacNFTF&6;vYz-9v

delta 11
ScmdmFI?t5p|G$lFbEE(u9|b1>

diff --git a/diarize.py b/diarize.py
index 5fc9a61..274cb7a 100644
--- a/diarize.py
+++ b/diarize.py
@@ -105,7 +105,7 @@ def decide_cpugpu():
     processing_input = input("Would you like to use your GPU or CPU for transcription? (1)GPU/(2)CPU): ")
     if processing_choice == "gpu" and (processing_input.lower() == "gpu" or processing_input == "1"):
         print("You've chosen to use the GPU.")
-        processing_choice = "gpu"
+        processing_choice = "cuda"
     elif processing_input.lower() == "cpu" or processing_input == "2":
         print("You've chosen to use the CPU.")
         processing_choice = "cpu"
@@ -158,13 +158,16 @@ def create_download_directory(title):
         print(f"Directory already exists: {session_path}")
     return session_path
 
+
+
 def normalize_title(title):
     # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
     title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
-    # Remove or replace illegal characters
     title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?', '').replace('<', '').replace('>', '').replace('|', '')
     return title
 
+
+
 def get_youtube(video_url):
     ydl_opts = {
         'format': 'bestaudio[ext=m4a]',
@@ -176,6 +179,8 @@ def get_youtube(video_url):
         info_dict = ydl.extract_info(video_url, download=False)
     return info_dict
 
+
+
 def download_video(video_url, download_path, info_dict):
     title = normalize_title(info_dict['title'])
     file_path = os.path.join(download_path, f"{title}.m4a")
@@ -230,16 +235,13 @@ def convert_to_wav(video_file_path, offset=0):
 
 
 
-
-
-
 # Transcribe .wav into .segments.json
 def speech_to_text(video_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False):
     print('loading faster_whisper model:', whisper_model)
     from faster_whisper import WhisperModel
     # printf(processing_choice)
     # 1 == GPU / 2 == CPU
-    model = WhisperModel(whisper_model, device=processing_choice)
+    model = WhisperModel(whisper_model, device=f"{processing_choice}")
     time_start = time.time()
     if(video_file_path == None):
         raise ValueError("Error no video input")
@@ -275,23 +277,14 @@ def speech_to_text(video_file_path, selected_source_lang='en', whisper_model='sm
             segments.append(chunk)
             i += 1
         print("transcribe audio done with fast whisper")
-
         with open(out_file,'w') as f:
             f.write(json.dumps(segments, indent=2))
-
     except Exception as e:
         raise RuntimeError("Error transcribing.")
-    
     return segments
 
 
 
-## Using Whisper.cpp
-# Get-Whisper-GGML.ps1
-# https://github.com/ggerganov/whisper.cpp/releases/latest
-
-
-
 # TODO: https://huggingface.co/pyannote/speaker-diarization-3.1
 # embedding_model = "pyannote/embedding", embedding_size=512
 # embedding_model = "speechbrain/spkrec-ecapa-voxceleb", embedding_size=192
@@ -408,19 +401,11 @@ def speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embed
 
 
 def main(youtube_url: str, num_speakers: int = 2, whisper_model: str = "small.en", offset: int = 0, vad_filter : bool = False):
-#    if user_choice == '2':
-#        video_path = get_youtube(list_of_videos)
-#FIXME
-
-#    video_info = get_youtube(youtube_url)
-#    download_path = create_download_directory(video_info['title'])
-#    video_path = download_video(youtube_url, download_path)
-#
     info_dict = get_youtube(youtube_url)
     download_path = create_download_directory(info_dict['title'])
     video_path = download_video(youtube_url, download_path, info_dict)
-#
     audio_file = convert_to_wav(video_path, offset)
+#FIXME
     segments = speech_to_text(video_path, whisper_model=whisper_model, vad_filter=vad_filter)
 #    df_results, save_path = speaker_diarize(video_path, segments, num_speakers=num_speakers)
 #    print("diarize complete:", save_path)