From b4a32d6014fa24591c9a9c2688d975a6fd357b7e Mon Sep 17 00:00:00 2001 From: Robert Date: Sat, 4 May 2024 13:59:57 -0700 Subject: [PATCH] Making progress... Added sanity checks for file existence, file naming(remove illegal windows filenames and normalize to ascii), cuda existence, --- .gitignore | Bin 6814 -> 6834 bytes diarize.py | 33 +++++++++------------------------ 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 6a00dd225faac9e0213cd69154f2005c5be28ee8..2f9281739171bfec1ce774a83882efcc0ad332e2 100644 GIT binary patch delta 32 mcmbPdy2+I7{r diff --git a/diarize.py b/diarize.py index 5fc9a61..274cb7a 100644 --- a/diarize.py +++ b/diarize.py @@ -105,7 +105,7 @@ def decide_cpugpu(): processing_input = input("Would you like to use your GPU or CPU for transcription? (1)GPU/(2)CPU): ") if processing_choice == "gpu" and (processing_input.lower() == "gpu" or processing_input == "1"): print("You've chosen to use the GPU.") - processing_choice = "gpu" + processing_choice = "cuda" elif processing_input.lower() == "cpu" or processing_input == "2": print("You've chosen to use the CPU.") processing_choice = "cpu" @@ -158,13 +158,16 @@ def create_download_directory(title): print(f"Directory already exists: {session_path}") return session_path + + def normalize_title(title): # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii') - # Remove or replace illegal characters title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?', '').replace('<', '').replace('>', '').replace('|', '') return title + + def get_youtube(video_url): ydl_opts = { 'format': 'bestaudio[ext=m4a]', @@ -176,6 +179,8 @@ def get_youtube(video_url): info_dict = ydl.extract_info(video_url, download=False) return info_dict + + def download_video(video_url, download_path, info_dict): title = normalize_title(info_dict['title']) file_path = os.path.join(download_path, f"{title}.m4a") @@ -230,16 +235,13 @@ def convert_to_wav(video_file_path, offset=0): - - - # Transcribe .wav into .segments.json def speech_to_text(video_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False): print('loading faster_whisper model:', whisper_model) from faster_whisper import WhisperModel # printf(processing_choice) # 1 == GPU / 2 == CPU - model = WhisperModel(whisper_model, device=processing_choice) + model = WhisperModel(whisper_model, device=f"{processing_choice}") time_start = time.time() if(video_file_path == None): raise ValueError("Error no video input") @@ -275,23 +277,14 @@ def speech_to_text(video_file_path, selected_source_lang='en', whisper_model='sm segments.append(chunk) i += 1 print("transcribe audio done with fast whisper") - with open(out_file,'w') as f: f.write(json.dumps(segments, indent=2)) - except Exception as e: raise RuntimeError("Error transcribing.") - return segments -## Using Whisper.cpp -# Get-Whisper-GGML.ps1 -# https://github.com/ggerganov/whisper.cpp/releases/latest - - - # TODO: https://huggingface.co/pyannote/speaker-diarization-3.1 # embedding_model = "pyannote/embedding", embedding_size=512 # embedding_model = "speechbrain/spkrec-ecapa-voxceleb", embedding_size=192 @@ -408,19 +401,11 @@ def speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embed def main(youtube_url: str, num_speakers: int = 2, whisper_model: str = "small.en", offset: int = 0, vad_filter : bool = False): -# if user_choice == '2': -# video_path = get_youtube(list_of_videos) -#FIXME - -# video_info = get_youtube(youtube_url) -# download_path = create_download_directory(video_info['title']) -# video_path = download_video(youtube_url, download_path) -# info_dict = get_youtube(youtube_url) download_path = create_download_directory(info_dict['title']) video_path = download_video(youtube_url, download_path, info_dict) -# audio_file = convert_to_wav(video_path, offset) +#FIXME segments = speech_to_text(video_path, whisper_model=whisper_model, vad_filter=vad_filter) # df_results, save_path = speaker_diarize(video_path, segments, num_speakers=num_speakers) # print("diarize complete:", save_path)