From 111e8d72a9e09e9d9b9d21d2a905e876c0dcc4d6 Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 7 May 2024 17:46:55 -0700 Subject: [PATCH] Can now download the actual videos if you want Can download full videos, not just audio, by passing '-v' or '--video' flag. Also added a 'list_of_videos.txt' for testing/validation. --- README.md | 60 ++++++++++++++++++-------- diarize.py | 105 +++++++++++++++++++++++++++++++++++---------- list_of_videos.txt | 3 ++ 3 files changed, 129 insertions(+), 39 deletions(-) create mode 100644 list_of_videos.txt diff --git a/README.md b/README.md index af01262..9c01313 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,28 @@ # TL/DW: Too Long, Didnt Watch -Take a URL, single video, list of URLs, or list of local videos + URLs and feed it into the script and have each video transcribed (and downloaded if not local) using faster-whisper. Transcriptions can then be shuffled off to an LLM API endpoint of your choice, whether that be local or remote. Any site supported by yt-dl is supported, so you can use this with sites besides just youtube. +Take a URL, single video, list of URLs, or list of local videos + URLs and feed it into the script and have each video transcribed (and audio downloaded if not local) using faster-whisper. Transcriptions can then be shuffled off to an LLM API endpoint of your choice, whether that be local or remote. Any site supported by yt-dl is supported, so you can use this with sites besides just youtube. -I personally recommend Sonnet, for the price it's very nice. +I personally recommend Sonnet, for the price, it's very nice. Original: `YouTube contains an incredible amount of knowledge, much of which is locked inside multi-hour videos. Let's extract and summarize it with AI!` ### tl/dr: Download Videos -> Transcribe -> Summarize. Scripted. 
-* Download->transcribe video from URL: `python diarize.py https://www.youtube.com/watch?v=4nd1CDZP21s` -* Download->transcribe->summarize using (`anthropic`/`cohere`/`openai`/`llama` - llama.cpp) API: `python diarize.py ./local/file_on_your/system --api_name ` +* Download Audio only from URL -> Transcribe audio: + * `python diarize.py https://www.youtube.com/watch?v=4nd1CDZP21s` +* Download Audio+Video from URL -> Transcribe audio from Video: + * `python diarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s` +* Download Audio only from URL -> Transcribe audio -> Summarize using (`anthropic`/`cohere`/`openai`/`llama` i.e. llama.cpp/`ooba`/`kobold`/`tabby`) API: + * `python diarize.py https://www.youtube.com/watch?v=4nd1CDZP21s --api_name API_NAME` +* Download Audio+Video from a list of videos in a text file (can be file paths or URLs) and have them all summarized: + * `python diarize.py -v ./local/file_on_your/system --api_name API_NAME` + - Use the script to transcribe a local file or remote url. * Any url youtube-dl supports _should_ work. - * If you pass an API name (openai/anthropic/cohere) as a second argument, and add your API key to the config file, you can have your resulting transcriptions summarized as well. - * The current approach to summarization is currently 'dumb'/naive, and will likely be replaced or additional functionality added to reflect actual practices and not just 'dump txt in and get an answer' approach. + * If you pass an API name (anthropic/cohere/groq/openai) as a second argument, and add your API key to the config file, you can have your resulting transcriptions summarized as well. + * Alternatively, you can pass `llama`/`ooba`/`kobold`/`tabby` and have the script perform a request to your local API endpoint for summarization. You will need to modify the `llama_api_IP` value in the `config.txt` to reflect the `IP:Port` of your local server. + * Or pass the `--api_url` argument with the `IP:Port` to avoid making changes to the `config.txt` file. 
+ * If the self-hosted server requires an API key, modify the appropriate api_key variable in the `config.txt` file. + * The current approach to summarization is currently 'dumb'/naive, and will likely be replaced or additional functionality added to reflect actual practices and not just 'dump txt in and get an answer' approach. This works for big context LLMs, but not everyone has access to them, and some transcriptions may be even longer, so we need to have an approach that can handle those cases. Save time and use the `config.txt` file, it allows you to set these settings and have them used when ran. ``` @@ -27,16 +37,19 @@ positional arguments: options: -h, --help show this help message and exit - --api_name API_NAME API name for summarization (optional) - --api_key API_KEY API key for summarization (optional) - --num_speakers NUM_SPEAKERS + -v, --video Download the video instead of just the audio + -name API_NAME, --api_name API_NAME + API name for summarization (optional) + -key API_KEY, --api_key API_KEY + API key for summarization (optional) - Please use the config file.... + -ns NUM_SPEAKERS, --num_speakers NUM_SPEAKERS Number of speakers (default: 2) - --whisper_model WHISPER_MODEL + -wm WHISPER_MODEL, --whisper_model WHISPER_MODEL Whisper model (default: small.en) - Available models: "`small`", "`medium`", "`small.en`","`medium.en`" - --offset OFFSET Offset in seconds (default: 0) - --vad_filter Enable VAD filter - --log_level {DEBUG,INFO,WARNING,ERROR,CRITICAL} + -off OFFSET, --offset OFFSET + Offset in seconds (default: 0) + -vad, --vad_filter Enable VAD filter + -log {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log_level {DEBUG,INFO,WARNING,ERROR,CRITICAL} Log level (default: INFO) >python diarize.py ./local/file_on_your/system --api_name anthropic @@ -54,7 +67,7 @@ By default videos, transcriptions and summaries are stored in a folder with the 1. Setup python + packages 2. Setup ffmpeg 3. Run `python diarize.py ` or `python diarize.py ` - 4. 
If you want summarization, add your API keys (if needed[is needed for now]) to the `config.txt` file, and then re-run the script, passing in the name of the API [or URL endpoint - to be added] to the script. + 4. If you want summarization, add your API keys (if not using a local LLM) to the `config.txt` file, and then re-run the script, passing in the name of the API [or URL endpoint - to be added] to the script. * `python diarize.py https://www.youtube.com/watch?v=4nd1CDZP21s --api_name anthropic` - This will attempt to download the video, then upload the resulting json file to the anthropic API endpoint, referring to values set in the config file (API key and model) to request summarization. - Anthropic: * Opus: `claude-3-opus-20240229` @@ -121,9 +134,9 @@ By default videos, transcriptions and summaries are stored in a folder with the - **Kobold.cpp** - **Exvllama2** - **Setting up a Local LLM Model** - 1. 3.8B/7GB base, 4GB Q8 microsoft/Phi-3-mini-128k-instruct - https://huggingface.co/microsoft/Phi-3-mini-128k-instruct + 1. microsoft/Phi-3-mini-128k-instruct - 3.8B Model/7GB base, 4GB Q8 - https://huggingface.co/microsoft/Phi-3-mini-128k-instruct * GGUF Quants: https://huggingface.co/pjh64/Phi-3-mini-128K-Instruct.gguf - 2. 8B/16GB base, 8.5GB Q8 - https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct + 2. Meta Llama3-8B - 8B Model/16GB base, 8.5GB Q8 - https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct * GGUF Quants: https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF @@ -137,6 +150,19 @@ By default videos, transcriptions and summaries are stored in a folder with the * List of Files(can be URLs and local files mixed): `python diarize.py ./path/to/your/text_file.txt"` + +### APIs supported: +1. Anthropic +2. Cohere +3. Groq +4. Llama.cpp +5. Kobold.cpp +6. TabbyAPI +7. OpenAI +8. 
Oobabooga + + + ### Credits - [original](https://github.com/the-crypt-keeper/tldw) - [yt-dlp](https://github.com/yt-dlp/yt-dlp) diff --git a/diarize.py b/diarize.py index 288c1fb..cae8aa5 100644 --- a/diarize.py +++ b/diarize.py @@ -360,19 +360,79 @@ def get_youtube(video_url): -def download_video(video_url, download_path, info_dict): +def download_video(video_url, download_path, info_dict, download_video_flag): logging.debug("About to normalize downloaded video title") title = normalize_title(info_dict['title']) - file_path = os.path.join(download_path, f"{title}.m4a") - ydl_opts = { - 'format': 'bestaudio[ext=m4a]', - 'outtmpl': file_path, - } - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - logging.debug("About to download video with youtube-dl") - ydl.download([video_url]) - logging.debug("Video successfully downloaded with youtube-dl") - return file_path + + if download_video_flag == False: + file_path = os.path.join(download_path, f"{title}.m4a") + ydl_opts = { + 'format': 'bestaudio[ext=m4a]', + 'outtmpl': file_path, + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + logging.debug("yt_dlp: About to download audio with youtube-dl") + ydl.download([video_url]) + logging.debug("yt_dlp: Audio successfully downloaded with youtube-dl") + return file_path + else: + video_file_path = os.path.join(download_path, f"{title}_video.mp4") + audio_file_path = os.path.join(download_path, f"{title}_audio.m4a") + ydl_opts_video = { + 'format': 'bestvideo[ext=mp4]', + 'outtmpl': video_file_path, + } + ydl_opts_audio = { + 'format': 'bestaudio[ext=m4a]', + 'outtmpl': audio_file_path, + } + + with yt_dlp.YoutubeDL(ydl_opts_video) as ydl: + logging.debug("yt_dlp: About to download video with youtube-dl") + ydl.download([video_url]) + logging.debug("yt_dlp: Video successfully downloaded with youtube-dl") + + with yt_dlp.YoutubeDL(ydl_opts_audio) as ydl: + logging.debug("yt_dlp: About to download audio with youtube-dl") + ydl.download([video_url]) + logging.debug("yt_dlp: Audio 
successfully downloaded with youtube-dl") + + output_file_path = os.path.join(download_path, f"{title}.mp4") + + if userOS == "Windows": + logging.debug("Running ffmpeg on Windows...") + ffmpeg_command = [ + '.\\Bin\\ffmpeg.exe', + '-i', video_file_path, + '-i', audio_file_path, + '-c:v', 'copy', + '-c:a', 'copy', + output_file_path + ] + subprocess.run(ffmpeg_command, check=True) + elif userOS == "Linux": + logging.debug("Running ffmpeg on Linux...") + ffmpeg_command = [ + 'ffmpeg', + '-i', video_file_path, + '-i', audio_file_path, + '-c:v', 'copy', + '-c:a', 'copy', + output_file_path + ] + subprocess.run(ffmpeg_command, check=True) + else: + logging.error("You shouldn't be here...") + exit() + os.remove(video_file_path) + os.remove(audio_file_path) + + return output_file_path + + + + + # # #################################################################################################################################### @@ -400,7 +460,7 @@ def convert_to_wav(video_file_path, offset=0): logging.debug("ffmpeg being ran on windows") if sys.platform.startswith('win'): - ffmpeg_cmd = './Bin/ffmpeg.exe' + ffmpeg_cmd = ".\\Bin\\ffmpeg.exe" else: ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems @@ -861,7 +921,7 @@ def save_summary_to_file(summary, file_path): #################################################################################################################################### # Main() # -def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False): +def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False): start_time = time.monotonic() if os.path.isfile(input_path) and input_path.endswith('.txt'): logging.debug("MAIN: User passed in a text file, processing text file...") @@ -880,7 +940,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model= download_path 
= create_download_directory(info_dict['title']) logging.debug("MAIN: Path created successfully") logging.debug("MAIN: Downloading video from yt_dlp...") - video_path = download_video(path, download_path, info_dict) + video_path = download_video(path, download_path, info_dict, download_video_flag) logging.debug("MAIN: Video downloaded successfully") logging.debug("MAIN: Converting video file to WAV...") audio_file = convert_to_wav(video_path, offset) @@ -945,13 +1005,14 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model= if __name__ == "__main__": parser = argparse.ArgumentParser(description='Transcribe and summarize videos.') parser.add_argument('input_path', type=str, help='Path or URL of the video', nargs='?') - parser.add_argument('--api_name', type=str, help='API name for summarization (optional)') - parser.add_argument('--api_key', type=str, help='API key for summarization (optional)') - parser.add_argument('--num_speakers', type=int, default=2, help='Number of speakers (default: 2)') - parser.add_argument('--whisper_model', type=str, default='small.en', help='Whisper model (default: small.en)') - parser.add_argument('--offset', type=int, default=0, help='Offset in seconds (default: 0)') - parser.add_argument('--vad_filter', action='store_true', help='Enable VAD filter') - parser.add_argument('--log_level', type=str, default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Log level (default: INFO)') + parser.add_argument('-v','--video', action='store_true', help='Download the video instead of just the audio') + parser.add_argument('-name', '--api_name', type=str, help='API name for summarization (optional)') + parser.add_argument('-key', '--api_key', type=str, help='API key for summarization (optional)') + parser.add_argument('-ns', '--num_speakers', type=int, default=2, help='Number of speakers (default: 2)') + parser.add_argument('-wm', '--whisper_model', type=str, default='small.en', help='Whisper 
model (default: small.en)') + parser.add_argument('-off', '--offset', type=int, default=0, help='Offset in seconds (default: 0)') + parser.add_argument('-vad', '--vad_filter', action='store_true', help='Enable VAD filter') + parser.add_argument('-log', '--log_level', type=str, default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Log level (default: INFO)') #parser.add_argument('--log_file', action=str, help='Where to save logfile (non-default)') args = parser.parse_args() @@ -985,7 +1046,7 @@ if __name__ == "__main__": check_ffmpeg() try: - results = main(args.input_path, api_name=args.api_name, api_key=args.api_key, num_speakers=args.num_speakers, whisper_model=args.whisper_model, offset=args.offset, vad_filter=args.vad_filter) + results = main(args.input_path, api_name=args.api_name, api_key=args.api_key, num_speakers=args.num_speakers, whisper_model=args.whisper_model, offset=args.offset, vad_filter=args.vad_filter, download_video_flag=args.video) logging.info('Transcription process completed.') except Exception as e: logging.error('An error occurred during the transcription process.') diff --git a/list_of_videos.txt b/list_of_videos.txt new file mode 100644 index 0000000..95cf2af --- /dev/null +++ b/list_of_videos.txt @@ -0,0 +1,3 @@ +https://www.youtube.com/shorts/siPhZvKk0xE +https://www.youtube.com/shorts/oNM-YLoVMKI +https://www.youtube.com/shorts/quuWzw2Ih6M \ No newline at end of file