mirror of
https://github.com/jlengrand/tldw.git
synced 2026-03-10 00:41:17 +00:00
And it's back to working.
Ok. So, rolling summary works with ChatGPT via CLI. Passing in a list works via CLI. Summarization of said list works via CLI. Demo GUI works. Dark/Light mode toggle does not work. Simple/Advanced mode toggle does not work. Detail slider in the GUI does not work. No current option for rolling summarization in the GUI. Lack of a 're-summarize/ask a question about the transcription' box in the GUI.
This commit is contained in:
@@ -1,227 +0,0 @@
|
||||
from typing import List, Tuple, Optional
|
||||
from openai import OpenAI
|
||||
import tiktoken
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# Adapted from: https://github.com/openai/openai-cookbook/blob/main/examples/Summarizing_long_documents.ipynb

# Load the sample document used throughout this demo script.
with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as source_file:
    artificial_intelligence = source_file.read()

# Report the document's token length for the target model.
encoding = tiktoken.encoding_for_model('gpt-4-turbo')
print(len(encoding.encode(artificial_intelligence)))

# Client wrapper around the OpenAI API.
# NOTE(review): the API key is hard-coded as an empty string — presumably it
# should be filled in or read from the environment; confirm before running.
client = OpenAI(api_key="")
|
||||
|
||||
|
||||
def get_chat_completion(messages, model='gpt-4-turbo'):
    """Send a chat request to the OpenAI API and return the reply text.

    Parameters:
    - messages: list of {"role": ..., "content": ...} message dicts.
    - model (str, optional): chat model name. Defaults to 'gpt-4-turbo'.

    Returns:
    - str: the content of the first choice in the completion.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,  # deterministic output for reproducible summaries
    )
    return completion.choices[0].message.content
|
||||
|
||||
|
||||
# Message Chunking <----- THE JUICY STUFF
|
||||
def tokenize(text: str) -> List[int]:
    """Encode *text* into tokens for the 'gpt-4-turbo' model.

    Returns the list of integer token ids. tiktoken's ``encode()`` yields
    ints, not strings, so the original ``List[str]`` annotation was wrong;
    callers only ever take ``len()`` of the result.
    """
    encoding = tiktoken.encoding_for_model('gpt-4-turbo')
    return encoding.encode(text)
|
||||
|
||||
|
||||
# This function chunks a text into smaller pieces based on a maximum token count and a delimiter
|
||||
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """Split *input_string* on *delimiter* and regroup the pieces into
    chunks of at most *max_tokens* tokens each.

    The delimiter is re-appended to every returned chunk, so joining the
    chunks approximately reproduces the original text. Pieces too large
    to fit the budget are dropped (with a warning) by the combiner.
    """
    pieces = input_string.split(delimiter)
    combined, _, dropped = combine_chunks_with_no_minimum(
        pieces, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True
    )
    if dropped > 0:
        print(f"Warning: {dropped} chunks were dropped due to exceeding the token limit.")
    return [f"{piece}{delimiter}" for piece in combined]
|
||||
|
||||
|
||||
# This function combines text chunks into larger blocks without exceeding a specified token count.
|
||||
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
|
||||
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """Greedily combine *chunks* into blocks of at most *max_tokens* tokens.

    Parameters:
    - chunks: text pieces to combine, in order.
    - max_tokens: token budget for each combined block.
    - chunk_delimiter: string used to join pieces within a block.
    - header: optional text prepended to every combined block.
    - add_ellipsis_for_overflow: when a single chunk alone exceeds the
      budget, append "..." to the current candidate (if it fits) to mark
      the dropped content.

    Returns a 3-tuple of (combined blocks, per-block lists of the original
    chunk indices, number of chunks dropped for overflowing the budget).
    NOTE: the original annotation claimed a 2-tuple ``Tuple[List[str],
    List[int]]``; the code has always returned three values with nested
    index lists, so the annotation is corrected here.
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined blocks
    output_indices = []  # per-block lists of source chunk indices
    candidate = (
        [] if header is None else [header]
    )  # pieces accumulated for the block currently being built
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # A chunk that cannot fit even on its own is dropped outright.
        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print("warning: chunk overflow")
            if (
                    add_ellipsis_for_overflow
                    and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
            # Count every dropped chunk; the original only counted drops
            # where the ellipsis happened to fit, under-reporting losses.
            dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # Estimate the token count with the current chunk added.
        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))
        # If the budget would be exceeded, flush the candidate and start a new one.
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # Otherwise keep extending the current candidate.
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # Flush the trailing candidate if it holds anything beyond the header.
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
|
||||
|
||||
|
||||
def summarize(text: str,
              detail: float = 0,
              model: str = 'gpt-4-turbo',
              additional_instructions: Optional[str] = None,
              minimum_chunk_size: Optional[int] = 500,
              chunk_delimiter: str = ".",
              summarize_recursively=False,
              verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the
      summary. 0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The model to use for generating summaries. Defaults to 'gpt-4-turbo'.
      (The docstring previously claimed 'gpt-3.5-turbo', contradicting the actual default.)
    - additional_instructions (Optional[str], optional): Additional instructions to provide to the
      model for customizing summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
    - summarize_recursively (bool, optional): If True, summaries are generated recursively, using
      previous summaries for context.
    - verbose (bool, optional): If True, prints detailed information about the chunking process.

    Returns:
    - str: The final compiled summary of the text.

    The function first determines the number of chunks by interpolating between a minimum and a maximum
    chunk count based on the `detail` parameter. It then splits the text into chunks and summarizes each
    chunk. If `summarize_recursively` is True, each summary is based on the previous summaries, adding
    more context to the summarization process. The function returns a compiled summary of all chunks.
    """

    # check detail is set correctly (assert kept so callers that expect an
    # AssertionError on bad input are unaffected)
    assert 0 <= detail <= 1

    # interpolate the number of chunks to get the specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))

    # adjust chunk_size based on interpolated number of chunks
    document_length = len(tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")

    # set system message
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Creating a structured prompt for recursive summarization
            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
        else:
            # Directly passing the chunk for summarization without recursive context
            user_message_content = chunk

        # Constructing messages based on whether recursive summarization is applied
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        # Request the summary of this chunk from the model
        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    # Compile final summary from partial summaries
    final_summary = '\n\n'.join(accumulated_summaries)

    return final_summary
|
||||
|
||||
# Demo: summarize the sample document at increasing levels of detail.

# Summary at 0 detail
summary_with_detail_0 = summarize(artificial_intelligence, detail=0, verbose=True)

# Summary at 0.25 detail
summary_with_detail_pt25 = summarize(artificial_intelligence, detail=0.25, verbose=True)

# Summary at 0.5 detail
summary_with_detail_pt5 = summarize(artificial_intelligence, detail=0.5, verbose=True)

# Summary at 0.75 detail
summary_with_detail_pt75 = summarize(artificial_intelligence, detail=0.75, verbose=True)

# Summary at 1 detail
summary_with_detail_1 = summarize(artificial_intelligence, detail=1, verbose=True)

# Lengths of summaries. The original was a bare notebook-style expression whose
# result is silently discarded when run as a script; print it so the comparison
# is actually visible.
print([len(tokenize(x)) for x in
       [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5,
        summary_with_detail_pt75, summary_with_detail_1]])

# print 0 detail summary
print(summary_with_detail_0)

# print 0.25 detail summary
print(summary_with_detail_pt25)

# print 0.5 detail summary
print(summary_with_detail_pt5)

# print 0.75 detail summary
print(summary_with_detail_pt75)

# print 1.0 detail summary
print(summary_with_detail_1)

# Print summary using additional instructions:
summary_with_additional_instructions = summarize(artificial_intelligence, detail=0.1,
                                                 additional_instructions="Write in point form and focus on numerical data.")
print(summary_with_additional_instructions)

# Print summary using recursive summarization:
recursive_summary = summarize(artificial_intelligence, detail=0.1, summarize_recursively=True)
print(recursive_summary)
21
summarize.py
21
summarize.py
@@ -318,12 +318,7 @@ def read_paths_from_file(file_path):
|
||||
""" Reads a file containing URLs or local file paths and returns them as a list. """
|
||||
paths = [] # Initialize paths as an empty list
|
||||
with open(file_path, 'r') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
if line and not os.path.exists(
|
||||
os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
|
||||
logging.debug("line successfully imported from file and added to list to be transcribed")
|
||||
paths.append(line)
|
||||
paths = [line.strip() for line in file]
|
||||
return paths
|
||||
|
||||
|
||||
@@ -331,10 +326,12 @@ def process_path(path):
|
||||
""" Decides whether the path is a URL or a local file and processes accordingly. """
|
||||
if path.startswith('http'):
|
||||
logging.debug("file is a URL")
|
||||
return get_youtube(path) # For YouTube URLs, modify to download and extract info
|
||||
# For YouTube URLs, modify to download and extract info
|
||||
return get_youtube(path)
|
||||
elif os.path.exists(path):
|
||||
logging.debug("File is a path")
|
||||
return process_local_file(path) # For local files, define a function to handle them
|
||||
# For local files, define a function to handle them
|
||||
return process_local_file(path)
|
||||
else:
|
||||
logging.error(f"Path does not exist: {path}")
|
||||
return None
|
||||
@@ -1668,7 +1665,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
||||
# except requests.exceptions.ConnectionError:
|
||||
# requests.status_code = "Connection: "
|
||||
# Perform summarization based on the specified API
|
||||
elif api_name and api_key:
|
||||
elif api_name:
|
||||
logging.debug(f"MAIN: Summarization being performed by {api_name}")
|
||||
json_file_path = audio_file.replace('.wav', '.segments.json')
|
||||
if api_name.lower() == 'openai':
|
||||
@@ -1758,7 +1755,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
||||
#end_time = time.monotonic()
|
||||
# print("Total program execution time: " + timedelta(seconds=end_time - start_time))
|
||||
|
||||
return results
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -1793,6 +1790,7 @@ if __name__ == "__main__":
|
||||
logging.basicConfig(level=getattr(logging, log_level), format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
custom_prompt = args.custom_prompt
|
||||
|
||||
if custom_prompt == "":
|
||||
logging.debug(f"Custom prompt defined, will use \n\nf{custom_prompt} \n\nas the prompt")
|
||||
print(f"Custom Prompt has been defined. Custom prompt: \n\n {args.custom_prompt}")
|
||||
@@ -1808,7 +1806,6 @@ if __name__ == "__main__":
|
||||
print("No custom prompt defined, will use default")
|
||||
|
||||
if args.user_interface:
|
||||
|
||||
launch_ui(demo_mode=False)
|
||||
else:
|
||||
if not args.input_path:
|
||||
@@ -1835,6 +1832,8 @@ if __name__ == "__main__":
|
||||
# Get all API keys from the config
|
||||
api_keys = {key: value for key, value in config.items('API') if key.endswith('_api_key')}
|
||||
|
||||
api_name = args.api_name
|
||||
|
||||
# Rolling Summarization will only be performed if an API is specified and the API key is available
|
||||
# and the rolling summarization flag is set
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user