From bb1acc42b916d996e5f35cef60a890029691eef1 Mon Sep 17 00:00:00 2001 From: Robert Date: Sat, 11 May 2024 22:25:29 -0700 Subject: [PATCH] More app.py fixes... --- .gitignore | Bin 8562 -> 8578 bytes HF/app.py | 167 +++++++++++++++++++++++++---------- Long_Summarize_openai.py | 184 +++++++++++++++++++++++++++++++++++++-- summarize.py | 5 +- 4 files changed, 300 insertions(+), 56 deletions(-) diff --git a/.gitignore b/.gitignore index a002b6dc8b66787ab76b28117f5038bac0f052c8..ca6e07786f8d7e08c720dbc18992c7962f4925ba 100644 GIT binary patch delta 28 icmez5)a1($prwyISYXR delta 11 ScmZp2{^Z2;|KCQo0!08JrUgL& diff --git a/HF/app.py b/HF/app.py index 6fb0230..9303ed9 100644 --- a/HF/app.py +++ b/HF/app.py @@ -39,7 +39,7 @@ import yt_dlp # 2. Usage of/Hardcoding HF_TOKEN as token for API calls # 3. Usage of HuggingFace for Inference # 4. Other stuff I can't remember. Will eventually do a diff and document them. -# +# #### @@ -63,10 +63,10 @@ import yt_dlp # llama.cpp)/`ooba` (oobabooga/text-gen-webui)/`kobold` (kobold.cpp)/`tabby` (Tabbyapi)) API:** python summarize.py # -v https://www.youtube.com/watch?v=4nd1CDZP21s -api ` - Make sure to put your API key into # `config.txt` under the appropriate API variable -# +# # Download Audio+Video from a list of videos in a text file (can be file paths or URLs) and have them all summarized:** # python summarize.py ./local/file_on_your/system --api_name ` -# +# # Run it as a WebApp** # python summarize.py -gui` - This requires you to either stuff your API keys into the `config.txt` file, or pass them into the app every time you want to use it. # Can be helpful for setting up a shared instance, but not wanting people to perform inference on your server. @@ -120,7 +120,7 @@ output_path = config.get('Paths', 'output_path', fallback='results') processing_choice = config.get('Processing', 'processing_choice', fallback='cpu') # Log file -#logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG) +# logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG) # # @@ -148,8 +148,8 @@ print(r""" | | | | / / | | | || |/\| | | | | |____ / / | |/ / \ /\ / _ \_/ \_____//_/ |___/ \/ \/ (_) - - + + _ _ | | | | | |_ ___ ___ | | ___ _ __ __ _ @@ -168,8 +168,8 @@ print(r""" #################################################################################################################################### # System Checks -# -# +# +# # Perform Platform Check userOS = "" @@ -291,13 +291,13 @@ def download_ffmpeg(): # -# +# #################################################################################################################################### #################################################################################################################################### # Processing Paths and local file handling -# +# # def read_paths_from_file(file_path): @@ -374,7 +374,7 @@ def process_url(input_path, num_speakers=2, whisper_model="small.en", custom_pro return json_data, summary_file_path, json_file_path, summary_file_path else: - return json_data, "Summary not available.", json_file_path, None + return json_data, "Summary not available.", json_file_path, "Summary not available." else: return None, "No results found.", None, None @@ -508,8 +508,8 @@ def download_video(video_url, download_path, info_dict, download_video_flag): ] subprocess.run(ffmpeg_command, check=True) else: - logging.error("You shouldn't be here...") - exit() + logging.error("ffmpeg: Unsupported operating system for video download and merging.") + raise RuntimeError("ffmpeg: Unsupported operating system for video download and merging.") os.remove(video_file_path) os.remove(audio_file_path) @@ -529,7 +529,7 @@ def download_video(video_url, download_path, info_dict, download_video_flag): # https://www.gyan.dev/ffmpeg/builds/ # -#os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"') +# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"') def convert_to_wav(video_file_path, offset=0): print("Starting conversion process of .m4a to .WAV") out_path = os.path.splitext(video_file_path)[0] + ".wav" @@ -539,7 +539,8 @@ def convert_to_wav(video_file_path, offset=0): logging.debug("ffmpeg being ran on windows") if sys.platform.startswith('win'): - ffmpeg_cmd = ".\\Bin\\ffmpeg.exe" + ffmpeg_cmd = "..\\Bin\\ffmpeg.exe" + logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}") else: ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems @@ -749,7 +750,7 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='sm #################################################################################################################################### -#Summarizers +# Summarizers # # @@ -1023,7 +1024,7 @@ def summarize_with_llama(api_url, file_path, token, custom_prompt): logging.debug("API Response Data: %s", response_data) if response.status_code == 200: - #if 'X' in response_data: + # if 'X' in response_data: logging.debug(response_data) summary = response_data['content'].strip() logging.debug("llama: Summarization successful") @@ -1236,28 +1237,11 @@ def process_text(api_key, text_file): return "Notice:", message +def format_file_path(file_path): + # Helper function to check file existence and return an appropriate path or message + return file_path if file_path and os.path.exists(file_path) else None + def launch_ui(demo_mode=False): - def process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, - download_video): - try: - # Assuming 'main' is the function that handles the processing logic. - # Adjust parameters as needed based on your actual 'main' function implementation. - results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers, - whisper_model=whisper_model, offset=offset, vad_filter=vad_filter, - download_video_flag=download_video, custom_prompt=custom_prompt) - - if results: - transcription_result = results[0] - json_data = transcription_result['transcription'] - summary_file_path = transcription_result.get('summary', "Summary not available.") - json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json') - video_file_path = transcription_result.get('video_path', None) - return json_data, summary_file_path, json_file_path, summary_file_path, video_file_path - else: - return "No results found.", "No summary available.", None, None, None - except Exception as e: - return str(e), "Error processing the request.", None, None, None - inputs = [ gr.components.Textbox(label="URL", placeholder="Enter the video URL here"), gr.components.Number(value=2, label="Number of Speakers"), @@ -1275,8 +1259,90 @@ def launch_ui(demo_mode=False): outputs = [ gr.components.Textbox(label="Transcription"), gr.components.Textbox(label="Summary or Status Message"), - gr.components.File(label="Download Transcription as JSON", visible=lambda x: x is not None), - gr.components.File(label="Download Summary as Text", visible=lambda x: x is not None), + gr.components.File(label="Download Transcription as JSON", visible=lambda x: x != "File not available"), + gr.components.File(label="Download Summary as Text", visible=lambda x: x != "File not available"), + gr.components.File(label="Download Video", visible=lambda x: x is not None) + ] + + def process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, + download_video): + video_file_path = None + try: + results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers, + whisper_model=whisper_model, offset=offset, vad_filter=vad_filter, + download_video_flag=download_video, custom_prompt=custom_prompt) + if results: + transcription_result = results[0] + json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json') + summary_file_path = json_file_path.replace('.segments.json', '_summary.txt') + + json_file_path = format_file_path(json_file_path) + summary_file_path = format_file_path(summary_file_path) + + return transcription_result['transcription'], "Summary available", json_file_path, summary_file_path, video_file_path + else: + return "No results found.", "No summary available.", None, None + except Exception as e: + return str(e), "Error processing the request.", None, None + + iface = gr.Interface( + fn=process_url, + inputs=inputs, + outputs=outputs, + title="Video Transcription and Summarization", + description="Submit a video URL for transcription and summarization. Ensure you input all necessary information including API keys." + ) + + iface.launch(share=False) + + + + +a = """def launch_ui(demo_mode=False): + def process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, + download_video): + try: + results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers, + whisper_model=whisper_model, offset=offset, vad_filter=vad_filter, + download_video_flag=download_video, custom_prompt=custom_prompt) + + if results: + transcription_result = results[0] + json_data = transcription_result['transcription'] + json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json') + summary_file_path = transcription_result.get('summary', "Summary not available.") + video_file_path = transcription_result.get('video_path', None) + + json_file_path = format_file_path(json_file_path) + summary_file_path = format_file_path(summary_file_path) + + return json_data, "Summary available", json_file_path, summary_file_path, video_file_path + else: + return "No results found.", "No summary available.", None, None, None + except Exception as e: + return str(e), "Error processing the request.", None, None, None, None + + inputs = [ + gr.components.Textbox(label="URL", placeholder="Enter the video URL here"), + gr.components.Number(value=2, label="Number of Speakers"), + gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model"), + gr.components.Textbox(label="Custom Prompt", + placeholder="Q: As a professional summarizer, create a concise and comprehensive summary of the provided text.\nA: Here is a detailed, bulleted list of the key points made in the transcribed video and supporting arguments:", + lines=3), + gr.components.Number(value=0, label="Offset"), + gr.components.Dropdown( + choices=["huggingface", "openai", "anthropic", "cohere", "groq", "llama", "kobold", "ooba"], + label="API Name"), + gr.components.Textbox(label="API Key", placeholder="Enter your API key here"), + gr.components.Checkbox(label="VAD Filter", value=False), + gr.components.Checkbox(label="Download Video", value=False) + ] + + outputs = [ + gr.components.Textbox(label="Transcription"), + gr.components.Textbox(label="Summary or Status Message"), + gr.components.File(label="Download Transcription as JSON", visible=lambda x: x != "File not available"), + gr.components.File(label="Download Summary as Text", visible=lambda x: x != "File not available"), gr.components.File(label="Download Video", visible=lambda x: x is not None) ] @@ -1290,7 +1356,7 @@ def launch_ui(demo_mode=False): ) iface.launch(share=False) - +""" # # @@ -1332,7 +1398,12 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model= download_path = create_download_directory(info_dict['title']) logging.debug("MAIN: Path created successfully") logging.debug("MAIN: Downloading video from yt_dlp...") - video_path = download_video(path, download_path, info_dict, download_video_flag) + try: + video_path = download_video(path, download_path, info_dict, download_video_flag) + except RuntimeError as e: + logging.error(f"Error downloading video: {str(e)}") + #FIXME - figure something out for handling this situation.... + continue logging.debug("MAIN: Video downloaded successfully") logging.debug("MAIN: Converting video file to WAV...") audio_file = convert_to_wav(video_path, offset) @@ -1436,7 +1507,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model= logging.error(f"Error processing path: {path}") logging.error(str(e)) end_time = time.monotonic() - #print("Total program execution time: " + timedelta(seconds=end_time - start_time)) + # print("Total program execution time: " + timedelta(seconds=end_time - start_time)) return results @@ -1455,7 +1526,9 @@ if __name__ == "__main__": choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Log level (default: INFO)') parser.add_argument('-ui', '--user_interface', action='store_true', help='Launch the Gradio user interface') parser.add_argument('-demo', '--demo_mode', action='store_true', help='Enable demo mode') - #parser.add_argument('--log_file', action=str, help='Where to save logfile (non-default)') + parser.add_argument('-prompt', '--custom_prompt', type=str, + help='Pass in a custom prompt to be used in place of the existing one.(Probably should just modify the script itself...)') + # parser.add_argument('--log_file', action=str, help='Where to save logfile (non-default)') args = parser.parse_args() custom_prompt = args.custom_prompt @@ -1467,9 +1540,9 @@ if __name__ == "__main__": args.custom_prompt = "\n\nQ: As a professional summarizer, create a concise and comprehensive summary of the provided text.\nA: Here is a detailed, bulleted list of the key points made in the transcribed video and supporting arguments:" print("No custom prompt defined, will use default") - print(f"Is CUDA available: {torch.cuda.is_available()}") + # print(f"Is CUDA available: {torch.cuda.is_available()}") # True - print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}") + # print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}") # Tesla T4 # Since this is running in HF.... @@ -1491,7 +1564,7 @@ if __name__ == "__main__": logging.info(f'Whisper model: {args.whisper_model}') logging.info(f'Offset: {args.offset}') logging.info(f'VAD filter: {args.vad_filter}') - logging.info(f'Log Level: {args.log_level}') #lol + logging.info(f'Log Level: {args.log_level}') # lol if args.api_name and args.api_key: logging.info(f'API: {args.api_name}') diff --git a/Long_Summarize_openai.py b/Long_Summarize_openai.py index 0a9d600..7ae4238 100644 --- a/Long_Summarize_openai.py +++ b/Long_Summarize_openai.py @@ -1,10 +1,11 @@ -import os from typing import List, Tuple, Optional from openai import OpenAI import tiktoken from tqdm import tqdm +# script from: https://github.com/openai/openai-cookbook/blob/main/examples/Summarizing_long_documents.ipynb + # Open dataset with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as file: @@ -14,15 +15,15 @@ with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as file: encoding = tiktoken.encoding_for_model('gpt-4-turbo') print(len(encoding.encode(artificial_intelligence))) - # Call wrapper to OpenAI client = OpenAI(api_key="") + def get_chat_completion(messages, model='gpt-4-turbo'): response = client.chat.completions.create( - model = model, - messages = messages, - temperature = 0, + model=model, + messages=messages, + temperature=0, ) return response.choices[0].message.content @@ -32,6 +33,7 @@ def tokenize(text: str) -> List[str]: encoding = tiktoken.encoding_for_model('gpt-4-turbo') return encoding.encode(text) + # This function chunks a text into smaller pieces based on a maximum token count and a delimiter def chunk_on_delimiter(input_string: str, max_tokens: int, @@ -45,13 +47,181 @@ def chunk_on_delimiter(input_string: str, combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks] return combined_chunks + # This function combines text chunks into larger blocks without exceeding a specified token count. # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow. def combine_chunks_with_no_minimum( chunks: List[str], max_tokens: int, - chunk_delimiter: str = "\n\n", + chunk_delimiter="\n\n", header: Optional[str] = None, - add_ellipsis_for_overflow: bool = False, + add_ellipsis_for_overflow=False, +) -> Tuple[List[str], List[int]]: + dropped_chunk_count = 0 + output = [] # list to hold the final combined chunks + output_indices = [] # list to hold the indices of the final combined chunks + candidate = ( + [] if header is None else [header] + ) # list to hold the current combined chunk candidate + candidate_indices = [] + for chunk_i, chunk in enumerate(chunks): + chunk_with_header = [chunk] if header is None else [header, chunk] + if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens: + print(f"warning: chunk overflow") + if ( + add_ellipsis_for_overflow + and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens + ): + candidate.append("...") + dropped_chunk_count += 1 + continue # this case would break downstream assumptions + # estimate token count with the current chunk added + extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk]))) + # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate + if extended_candidate_token_count > max_tokens: + output.append(chunk_delimiter.join(candidate)) + output_indices.append(candidate_indices) + candidate = chunk_with_header # re-initialize candidate + candidate_indices = [chunk_i] + # otherwise keep extending the candidate + else: + candidate.append(chunk) + candidate_indices.append(chunk_i) + # add the remaining candidate to output if it's not empty + if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0): + output.append(chunk_delimiter.join(candidate)) + output_indices.append(candidate_indices) + return output, output_indices, dropped_chunk_count +def summarize(text: str, + detail: float = 0, + model: str = 'gpt-4-turbo', + additional_instructions: Optional[str] = None, + minimum_chunk_size: Optional[int] = 500, + chunk_delimiter: str = ".", + summarize_recursively=False, + verbose=False): + """ + Summarizes a given text by splitting it into chunks, each of which is summarized individually. + The level of detail in the summary can be adjusted, and the process can optionally be made recursive. + + Parameters: - text (str): The text to be summarized. - detail (float, optional): A value between 0 and 1 + indicating the desired level of detail in the summary. 0 leads to a higher level summary, and 1 results in a more + detailed summary. Defaults to 0. - model (str, optional): The model to use for generating summaries. Defaults to + 'gpt-3.5-turbo'. - additional_instructions (Optional[str], optional): Additional instructions to provide to the + model for customizing summaries. - minimum_chunk_size (Optional[int], optional): The minimum size for text + chunks. Defaults to 500. - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. + Defaults to ".". - summarize_recursively (bool, optional): If True, summaries are generated recursively, + using previous summaries for context. - verbose (bool, optional): If True, prints detailed information about the + chunking process. + + Returns: + - str: The final compiled summary of the text. + + The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count + based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If + `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the + summarization process. The function returns a compiled summary of all chunks. + """ + + # check detail is set correctly + assert 0 <= detail <= 1 + + # interpolate the number of chunks based to get specified level of detail + max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter)) + min_chunks = 1 + num_chunks = int(min_chunks + detail * (max_chunks - min_chunks)) + + # adjust chunk_size based on interpolated number of chunks + document_length = len(tokenize(text)) + chunk_size = max(minimum_chunk_size, document_length // num_chunks) + text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter) + if verbose: + print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.") + print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}") + + # set system message + system_message_content = "Rewrite this text in summarized form." + if additional_instructions is not None: + system_message_content += f"\n\n{additional_instructions}" + + accumulated_summaries = [] + for chunk in tqdm(text_chunks): + if summarize_recursively and accumulated_summaries: + # Creating a structured prompt for recursive summarization + accumulated_summaries_string = '\n\n'.join(accumulated_summaries) + user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}" + else: + # Directly passing the chunk for summarization without recursive context + user_message_content = chunk + + # Constructing messages based on whether recursive summarization is applied + messages = [ + {"role": "system", "content": system_message_content}, + {"role": "user", "content": user_message_content} + ] + + # Assuming this function gets the completion and works as expected + response = get_chat_completion(messages, model=model) + accumulated_summaries.append(response) + + # Compile final summary from partial summaries + final_summary = '\n\n'.join(accumulated_summaries) + + return final_summary + +# Summary at 0 detail +summary_with_detail_0 = summarize(artificial_intelligence, detail=0, verbose=True) + + +# Summary at 0.25 detail +summary_with_detail_pt25 = summarize(artificial_intelligence, detail=0.25, verbose=True) + + +# Summary at 0.5 detail +summary_with_detail_pt5 = summarize(artificial_intelligence, detail=0.5, verbose=True) + + +# Summary at 0.75 detail +summary_with_detail_pt75 = summarize(artificial_intelligence, detail=0.75, verbose=True) + + +# Summart at 1 detail +summary_with_detail_1 = summarize(artificial_intelligence, detail=1, verbose=True) + + +# Lengths of summaries: +[len(tokenize(x)) for x in + [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5, summary_with_detail_pt75, summary_with_detail_1]] + +# print 0 detail summary +print(summary_with_detail_0) + + +# print 0.25 detail summary +print(summary_with_detail_pt25) + + +# print 0.5 detail summary +print(summary_with_detail_pt5) + + +# print 0.75 detail summary +print(summary_with_detail_pt75) + + +# print 1.0 detail summary +print(summary_with_detail_1) + + +# Print summary using additional instructions: +summary_with_additional_instructions = summarize(artificial_intelligence_wikipedia_text, detail=0.1, + additional_instructions="Write in point form and focus on numerical data.") +print(summary_with_additional_instructions) + + +# Print summary using recursive summarization: +recursive_summary = summarize(artificial_intelligence_wikipedia_text, detail=0.1, summarize_recursively=True) +print(recursive_summary) + diff --git a/summarize.py b/summarize.py index 633d10b..01c754d 100644 --- a/summarize.py +++ b/summarize.py @@ -500,8 +500,8 @@ def download_video(video_url, download_path, info_dict, download_video_flag): ] subprocess.run(ffmpeg_command, check=True) else: - logging.error("You shouldn't be here...") - exit() + logging.error("ffmpeg: Unsupported operating system for video download and merging.") + raise RuntimeError("ffmpeg: Unsupported operating system for video download and merging.") os.remove(video_file_path) os.remove(audio_file_path) @@ -533,6 +533,7 @@ def convert_to_wav(video_file_path, offset=0): if sys.platform.startswith('win'): ffmpeg_cmd = ".\\Bin\\ffmpeg.exe" + logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}") else: ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems