From 3147fdec42d99206a8fa1323e1d0d454929d1ade Mon Sep 17 00:00:00 2001 From: Robert Date: Wed, 15 May 2024 21:08:34 -0700 Subject: [PATCH] OpenAI rolling Summarization works Rolling summarization using OpenAI works, using the '-detail {0.01 - 1.00}' command --- README.md | 2 +- summarize.py | 168 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 134 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 30f1ecc..fb05d34 100644 --- a/README.md +++ b/README.md @@ -245,7 +245,7 @@ By default videos, transcriptions and summaries are stored in a folder with the ------------ ### Similar/Other projects: - https://github.com/Dicklesworthstone/bulk_transcribe_youtube_videos_from_playlist/tree/main - +- https://github.com/akashe/YoutubeSummarizer ------------ ### Credits diff --git a/summarize.py b/summarize.py index 7709e94..88c7f95 100644 --- a/summarize.py +++ b/summarize.py @@ -30,12 +30,16 @@ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False" ####### # Function Sections # +# Config Loading # System Checks # Processing Paths and local file handling # Video Download/Handling # Audio Transcription # Diarization +# Chunking-related Techniques & Functions +# Tokenization-related Techniques & Functions # Summarizers +# Gradio UI # Main # ####### @@ -363,9 +367,9 @@ def process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_nam video_file_path = None print("API Name received:", api_name) # Debugging line try: - results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers, - whisper_model=whisper_model, offset=offset, vad_filter=vad_filter, - download_video_flag=download_video, custom_prompt=custom_prompt) + results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers, whisper_model=whisper_model, + offset=offset, vad_filter=vad_filter, download_video_flag=download_video, + custom_prompt=custom_prompt) if results: transcription_result = results[0] @@ -558,7 +562,7 @@ def convert_to_wav(video_file_path, offset=0, overwrite=False): logging.debug("ffmpeg being ran on windows") if sys.platform.startswith('win'): - ffmpeg_cmd = "..\\Bin\\ffmpeg.exe" + ffmpeg_cmd = ".\\Bin\\ffmpeg.exe" logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}") else: ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems @@ -781,19 +785,33 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='sm # +# This is dirty and shameful and terrible. It should be replaced with a proper implementation. +# anyways lets get to it.... +client = OpenAI(api_key=openai_api_key) +def get_chat_completion(messages, model='gpt-4-turbo'): + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=0, + ) + return response.choices[0].message.content + + # This function chunks a text into smaller pieces based on a maximum token count and a delimiter def chunk_on_delimiter(input_string: str, max_tokens: int, delimiter: str) -> List[str]: chunks = input_string.split(delimiter) - combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True) + combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum( + chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True) if dropped_chunk_count > 0: print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.") combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks] return combined_chunks -# This function combines chunks into larger pieces based on a maximum token count +# This function combines text chunks into larger blocks without exceeding a specified token count. +# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow. def combine_chunks_with_no_minimum( chunks: List[str], max_tokens: int, @@ -810,16 +828,19 @@ def combine_chunks_with_no_minimum( candidate_indices = [] for chunk_i, chunk in enumerate(chunks): chunk_with_header = [chunk] if header is None else [header, chunk] + #FIXME MAKE NOT OPENAI SPECIFIC if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens: print(f"warning: chunk overflow") if ( add_ellipsis_for_overflow + # FIXME MAKE NOT OPENAI SPECIFIC and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens ): candidate.append("...") dropped_chunk_count += 1 continue # this case would break downstream assumptions # estimate token count with the current chunk added + # FIXME MAKE NOT OPENAI SPECIFIC extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk]))) # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate if extended_candidate_token_count > max_tokens: @@ -837,7 +858,8 @@ def combine_chunks_with_no_minimum( output_indices.append(candidate_indices) return output, output_indices, dropped_chunk_count -def openai_chunk_summarize(text: str, + +def rolling_summarize(text: str, detail: float = 0, model: str = 'gpt-4-turbo', additional_instructions: Optional[str] = None, @@ -877,11 +899,13 @@ def openai_chunk_summarize(text: str, num_chunks = int(min_chunks + detail * (max_chunks - min_chunks)) # adjust chunk_size based on interpolated number of chunks + # FIXME MAKE NOT OPENAI SPECIFIC document_length = len(openai_tokenize(text)) chunk_size = max(minimum_chunk_size, document_length // num_chunks) text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter) if verbose: print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.") + # FIXME MAKE NOT OPENAI SPECIFIC print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}") # set system message @@ -894,8 +918,7 @@ def openai_chunk_summarize(text: str, if summarize_recursively and accumulated_summaries: # Creating a structured prompt for recursive summarization accumulated_summaries_string = '\n\n'.join(accumulated_summaries) - user_message_content = (f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize " - f"next:\n\n{chunk}") + user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}" else: # Directly passing the chunk for summarization without recursive context user_message_content = chunk @@ -911,6 +934,7 @@ def openai_chunk_summarize(text: str, accumulated_summaries.append(response) # Compile final summary from partial summaries + global final_summary final_summary = '\n\n'.join(accumulated_summaries) return final_summary @@ -930,20 +954,6 @@ def openai_tokenize(text: str) -> List[str]: return encoding.encode(text) # openai summarize chunks -def summarize_with_detail(detail, final_summary): - openai_summarization = openai_chunk_summarize(final_summary, detail=detail, verbose=True) - return openai_summarization - - -def get_chat_completion(messages, model='gpt-4-turbo'): - client = OpenAI(api_key="") - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=0, - ) - return response.choices[0].message.content - # # @@ -956,6 +966,7 @@ def get_chat_completion(messages, model='gpt-4-turbo'): # # + def extract_text_from_segments(segments): logging.debug(f"Main: extracting text from {segments}") text = ' '.join([segment['text'] for segment in segments]) @@ -1349,6 +1360,26 @@ def save_summary_to_file(summary, file_path): ####################################################################################################################### +####################################################################################################################### +# Summarization with Detail +# + +def summarize_with_detail_openai(text, detail, verbose=False): + # FIXME MAKE function not specific to the artifiical intelligence example + summary_with_detail_variable = rolling_summarize(text, detail=detail, verbose=True) + print(len(openai_tokenize(summary_with_detail_variable))) + return summary_with_detail_variable + + +def summarize_with_detail_recursive_openai(text, detail, verbose=False): + summary_with_recursive_summarization = rolling_summarize(text, detail=detail, summarize_recursively=True) + print(summary_with_recursive_summarization) + +# +# +####################################################################################################################### + + ####################################################################################################################### # Gradio UI # @@ -1539,8 +1570,9 @@ def launch_ui(demo_mode=False): # Main() # def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, - download_video_flag=False, demo_mode=False, custom_prompt=None, overwrite=False): - global summary + download_video_flag=False, demo_mode=False, custom_prompt=None, overwrite=False, + rolling_summarization=None, detail=0.01): + global summary, audio_file if input_path is None and args.user_interface: return [] start_time = time.monotonic() @@ -1568,6 +1600,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model= if path.startswith('http'): logging.debug("MAIN: URL Detected") info_dict = get_youtube(path) + json_file_path = None if info_dict: logging.debug("MAIN: Creating path for video file...") download_path = create_download_directory(info_dict['title']) @@ -1599,10 +1632,43 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model= 'transcription': segments } results.append(transcription_result) - logging.info(f"Transcription complete: {audio_file}") + logging.info(f"MAIN: Transcription complete: {audio_file}") + # Perform rolling summarization based on API Name, detail level, and if an API key exists + # Will remove the API key once rolling is added for llama.cpp + if rolling_summarization: + logging.info("MAIN: Rolling Summarization") + + # Extract the text from the segments + text = extract_text_from_segments(segments) + + # Set the json_file_path + json_file_path = audio_file.replace('.wav', '.segments.json') + + # Perform rolling summarization + summary = summarize_with_detail_openai(text, detail=args.detail_level, verbose=False) + + # Handle the summarized output + if summary: + transcription_result['summary'] = summary + logging.info("MAIN: Rolling Summarization successful.") + save_summary_to_file(summary, json_file_path) + else: + logging.warning("MAIN: Rolling Summarization failed.") + + # if api_name and api_key: + # logging.debug(f"MAIN: Rolling summarization being performed by {api_name}") + # json_file_path = audio_file.replace('.wav', '.segments.json') + # if api_name.lower() == 'openai': + # openai_api_key = api_key if api_key else config.get('API', 'openai_api_key', + # fallback=None) + # try: + # logging.debug(f"MAIN: trying to summarize with openAI") + # summary = (openai_api_key, json_file_path, openai_model, custom_prompt) + # except requests.exceptions.ConnectionError: + # requests.status_code = "Connection: " # Perform summarization based on the specified API - if api_name and api_key: + elif api_name and api_key: logging.debug(f"MAIN: Summarization being performed by {api_name}") json_file_path = audio_file.replace('.wav', '.segments.json') if api_name.lower() == 'openai': @@ -1677,6 +1743,10 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model= transcription_result['summary'] = summary logging.info(f"Summary generated using {api_name} API") save_summary_to_file(summary, json_file_path) + elif final_summary: + logging.info(f"Rolling summary generated using {api_name} API") + logging.info(f"Final Rolling summary is {final_summary}\n\n") + save_summary_to_file(final_summary, json_file_path) else: logging.warning(f"Failed to generate summary using {api_name} API") else: @@ -1707,8 +1777,16 @@ if __name__ == "__main__": parser.add_argument('-ui', '--user_interface', action='store_true', help="Launch the Gradio user interface") parser.add_argument('-demo', '--demo_mode', action='store_true', help='Enable demo mode') parser.add_argument('-prompt', '--custom_prompt', type=str, - help='Pass in a custom prompt to be used in place of the existing one.(Probably should just ' + help='Pass in a custom prompt to be used in place of the existing one.\n (Probably should just ' 'modify the script itself...)') + parser.add_argument('-overwrite', '--overwrite', action='store_true', help='Overwrite existing files') + parser.add_argument('-roll', '--rolling_summarization', action='store_true', help='Enable rolling summarization') + parser.add_argument('-detail', '--detail_level', type=float, help='Mandatory if rolling summarization is enabled, ' + 'defines the chunk size.\n Default is 0.01(lots ' + 'of chunks) -> 1.00 (few chunks)\n Currently ' + 'only OpenAI works. ', + default=0.01,) + # parser.add_argument('-o', '--output_path', type=str, help='Path to save the output file') # parser.add_argument('--log_file', action=str, help='Where to save logfile (non-default)') args = parser.parse_args() @@ -1740,20 +1818,38 @@ if __name__ == "__main__": logging.info('Starting the transcription and summarization process.') logging.info(f'Input path: {args.input_path}') logging.info(f'API Name: {args.api_name}') - logging.debug(f'API Key: {args.api_key}') # ehhhhh logging.info(f'Number of speakers: {args.num_speakers}') logging.info(f'Whisper model: {args.whisper_model}') logging.info(f'Offset: {args.offset}') logging.info(f'VAD filter: {args.vad_filter}') logging.info(f'Log Level: {args.log_level}') # lol + logging.info(f'Demo Mode: {args.demo_mode}') + logging.info(f'Custom Prompt: {args.custom_prompt}') + logging.info(f'Overwrite: {args.overwrite}') + logging.info(f'Rolling Summarization: {args.rolling_summarization}') + logging.info(f'User Interface: {args.user_interface}') + logging.info(f'Video Download: {args.video}') + # logging.info(f'Save File location: {args.output_path}') + # logging.info(f'Log File location: {args.log_file}') + + # Get all API keys from the config + api_keys = {key: value for key, value in config.items('API') if key.endswith('_api_key')} + + # Rolling Summarization will only be performed if an API is specified and the API key is available + # and the rolling summarization flag is set + # + summary = None # Initialize to ensure it's always defined + if args.api_name and args.rolling_summarization and any( + key.startswith(args.api_name) and value is not None for key, value in api_keys.items()): + logging.info(f'MAIN: API used: {args.api_name}') + logging.info('MAIN: Rolling Summarization will be performed.') + + elif args.api_name: + logging.info(f'MAIN: API used: {args.api_name}') + logging.info('MAIN: Summarization (not rolling) will be performed.') - if args.api_name and args.api_key: - logging.info(f'API: {args.api_name}') - logging.info('Summarization will be performed.') - summary = None # Initialize to ensure it's always defined else: logging.info('No API specified. Summarization will not be performed.') - summary = None # Initialize to ensure it's always defined logging.debug("Platform check being performed...") platform_check() @@ -1765,7 +1861,9 @@ if __name__ == "__main__": try: results = main(args.input_path, api_name=args.api_name, api_key=args.api_key, num_speakers=args.num_speakers, whisper_model=args.whisper_model, offset=args.offset, - vad_filter=args.vad_filter, download_video_flag=args.video, overwrite=args.overwrite) + vad_filter=args.vad_filter, download_video_flag=args.video, overwrite=args.overwrite, + rolling_summarization=args.rolling_summarization, custom_prompt=args.custom_prompt, + demo_mode=args.demo_mode, detail=args.detail_level) logging.info('Transcription process completed.') except Exception as e: logging.error('An error occurred during the transcription process.')