mirror of
https://github.com/jlengrand/tldw.git
synced 2026-03-10 08:51:17 +00:00
And it's back to working.
OK. So, rolling summary works with ChatGPT via the CLI. Passing in a list works via the CLI. Summarization of said list works via the CLI. The demo GUI works. The dark/light mode toggle does not work. The simple/advanced mode toggle does not work. The detail slider in the GUI does not work. There is currently no option for rolling summarization in the GUI, and the GUI lacks a "re-summarize / ask a question about the transcription" box.
This commit is contained in:
@@ -1,227 +0,0 @@
|
|||||||
import os
from typing import List, Optional, Tuple

import tiktoken
from openai import OpenAI
from tqdm import tqdm
|
|
||||||
|
|
||||||
|
|
||||||
# script from: https://github.com/openai/openai-cookbook/blob/main/examples/Summarizing_long_documents.ipynb
|
|
||||||
|
|
||||||
|
|
||||||
# Open dataset
# NOTE(review): Windows-style relative path — assumes the script runs from the
# repository root on Windows; confirm before running elsewhere.
with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as file:
    artificial_intelligence = file.read()

# Load the tokenizer for the target model and print the dataset's token count,
# as a sanity check that chunking is actually needed.
encoding = tiktoken.encoding_for_model('gpt-4-turbo')
print(len(encoding.encode(artificial_intelligence)))

# Client wrapper to OpenAI.
# Fix: the key was hard-coded as "" (every request would fail authentication);
# read it from the environment instead. Falls back to "" so behavior when the
# variable is unset matches the original.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
|
|
||||||
|
|
||||||
|
|
||||||
def get_chat_completion(messages, model='gpt-4-turbo'):
    """Send *messages* to the chat-completions endpoint and return the reply text.

    Temperature is pinned to 0 so repeated runs produce deterministic summaries.
    Uses the module-level ``client``.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
||||||
|
|
||||||
|
|
||||||
# Message Chunking <----- THE JUICY STUFF
|
|
||||||
def tokenize(text: str) -> List[int]:
    """Encode *text* with the gpt-4-turbo tokenizer and return the token ids.

    Fix: the original annotation claimed ``List[str]``, but
    ``encoding.encode`` returns integer token ids.
    (``tiktoken.encoding_for_model`` caches the loaded encoding internally,
    so re-resolving it on every call is cheap — presumably; confirm if this
    becomes a hot path.)
    """
    encoding = tiktoken.encoding_for_model('gpt-4-turbo')
    return encoding.encode(text)
|
|
||||||
|
|
||||||
|
|
||||||
# Splits text on a delimiter and packs the pieces into token-budgeted chunks.
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """Split *input_string* on *delimiter*, pack the pieces into chunks of at
    most *max_tokens* tokens, and re-append the delimiter to each packed chunk.

    Prints a warning when any pieces had to be dropped for exceeding the
    token budget on their own.
    """
    pieces = input_string.split(delimiter)
    packed, _, dropped_chunk_count = combine_chunks_with_no_minimum(
        pieces, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True
    )
    if dropped_chunk_count > 0:
        print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
    return [f"{chunk}{delimiter}" for chunk in packed]
|
|
||||||
|
|
||||||
|
|
||||||
# This function combines text chunks into larger blocks without exceeding a specified token count.
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """Greedily pack *chunks*, in order, into combined blocks of at most
    *max_tokens* tokens (measured with ``tokenize``).

    Parameters:
    - chunks: pieces of text to pack.
    - max_tokens: token budget per combined block.
    - chunk_delimiter: string joined between pieces within a block.
    - header: optional text prepended to every combined block.
    - add_ellipsis_for_overflow: if True, a piece that alone exceeds the
      budget is represented by "..." in the current block (when "..." fits).

    Returns a 3-tuple of (combined blocks, per-block lists of the source
    indices that went into each block, number of dropped over-budget pieces).

    Fixes vs. original: the return annotation claimed ``Tuple[List[str],
    List[int]]`` although the function returns a 3-tuple whose second element
    is a list of index lists; the overflow warning used an f-string with no
    placeholders.
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined chunks
    output_indices = []  # list to hold the indices of the final combined chunks
    candidate = (
        [] if header is None else [header]
    )  # list to hold the current combined chunk candidate
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # A piece that exceeds the budget on its own can never be packed.
        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print("warning: chunk overflow")
            if (
                    add_ellipsis_for_overflow
                    and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
                dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # estimate token count with the current chunk added
        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))
        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # otherwise keep extending the candidate
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # add the remaining candidate to output if it's not empty
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
|
|
||||||
|
|
||||||
|
|
||||||
def summarize(text: str,
              detail: float = 0,
              model: str = 'gpt-4-turbo',
              additional_instructions: Optional[str] = None,
              minimum_chunk_size: Optional[int] = 500,
              chunk_delimiter: str = ".",
              summarize_recursively=False,
              verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail.
      0 leads to a higher-level summary; 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The model used for generating summaries. Defaults to 'gpt-4-turbo'
      (fix: the original docstring incorrectly said 'gpt-3.5-turbo').
    - additional_instructions (Optional[str], optional): Extra instructions appended to the
      system prompt for customizing summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum token size for text chunks.
      Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks.
      Defaults to ".".
    - summarize_recursively (bool, optional): If True, each chunk summary is generated with the
      previously accumulated summaries as context.
    - verbose (bool, optional): If True, prints detailed information about the chunking process.

    Returns:
    - str: The final compiled summary of the text.

    Raises:
    - ValueError: If *detail* is outside the range [0, 1].

    The function first interpolates the number of chunks between a minimum and a maximum chunk
    count based on `detail`, then splits the text accordingly and summarizes each chunk. If
    `summarize_recursively` is True, each summary is based on the previous summaries, adding
    more context to the summarization process.
    """
    # Validate `detail` (fix: was an `assert`, which is stripped under -O).
    if not 0 <= detail <= 1:
        raise ValueError("detail must be between 0 and 1")

    # interpolate the number of chunks to get the specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))

    # adjust chunk_size based on interpolated number of chunks
    document_length = len(tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")

    # set system message
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Creating a structured prompt for recursive summarization
            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
        else:
            # Directly passing the chunk for summarization without recursive context
            user_message_content = chunk

        # Constructing messages based on whether recursive summarization is applied
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        # Summarize this chunk and accumulate the result
        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    # Compile final summary from partial summaries
    final_summary = '\n\n'.join(accumulated_summaries)

    return final_summary
|
|
||||||
|
|
||||||
# Generate summaries at increasing levels of detail.
summary_with_detail_0 = summarize(artificial_intelligence, detail=0, verbose=True)

summary_with_detail_pt25 = summarize(artificial_intelligence, detail=0.25, verbose=True)

summary_with_detail_pt5 = summarize(artificial_intelligence, detail=0.5, verbose=True)

summary_with_detail_pt75 = summarize(artificial_intelligence, detail=0.75, verbose=True)

# Summary at 1 detail (fix: comment typo "Summart")
summary_with_detail_1 = summarize(artificial_intelligence, detail=1, verbose=True)

# Lengths of summaries (fix: the bare list comprehension was notebook residue
# whose value was silently discarded when run as a script — print it instead).
summary_lengths = [len(tokenize(x)) for x in
                   [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5,
                    summary_with_detail_pt75, summary_with_detail_1]]
print(summary_lengths)
|
|
||||||
|
|
||||||
# Print each detail-level summary, in ascending order of detail.
for detail_summary in (summary_with_detail_0,
                       summary_with_detail_pt25,
                       summary_with_detail_pt5,
                       summary_with_detail_pt75,
                       summary_with_detail_1):
    print(detail_summary)
|
|
||||||
|
|
||||||
|
|
||||||
# Summary steered by additional instructions.
summary_with_additional_instructions = summarize(
    artificial_intelligence,
    detail=0.1,
    additional_instructions="Write in point form and focus on numerical data.",
)
print(summary_with_additional_instructions)

# Summary built recursively, feeding earlier chunk summaries back in as context.
recursive_summary = summarize(artificial_intelligence, detail=0.1, summarize_recursively=True)
print(recursive_summary)
|
|
||||||
|
|
||||||
21
summarize.py
21
summarize.py
@@ -318,12 +318,7 @@ def read_paths_from_file(file_path):
|
|||||||
""" Reads a file containing URLs or local file paths and returns them as a list. """
|
""" Reads a file containing URLs or local file paths and returns them as a list. """
|
||||||
paths = [] # Initialize paths as an empty list
|
paths = [] # Initialize paths as an empty list
|
||||||
with open(file_path, 'r') as file:
|
with open(file_path, 'r') as file:
|
||||||
for line in file:
|
paths = [line.strip() for line in file]
|
||||||
line = line.strip()
|
|
||||||
if line and not os.path.exists(
|
|
||||||
os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
|
|
||||||
logging.debug("line successfully imported from file and added to list to be transcribed")
|
|
||||||
paths.append(line)
|
|
||||||
return paths
|
return paths
|
||||||
|
|
||||||
|
|
||||||
@@ -331,10 +326,12 @@ def process_path(path):
|
|||||||
""" Decides whether the path is a URL or a local file and processes accordingly. """
|
""" Decides whether the path is a URL or a local file and processes accordingly. """
|
||||||
if path.startswith('http'):
|
if path.startswith('http'):
|
||||||
logging.debug("file is a URL")
|
logging.debug("file is a URL")
|
||||||
return get_youtube(path) # For YouTube URLs, modify to download and extract info
|
# For YouTube URLs, modify to download and extract info
|
||||||
|
return get_youtube(path)
|
||||||
elif os.path.exists(path):
|
elif os.path.exists(path):
|
||||||
logging.debug("File is a path")
|
logging.debug("File is a path")
|
||||||
return process_local_file(path) # For local files, define a function to handle them
|
# For local files, define a function to handle them
|
||||||
|
return process_local_file(path)
|
||||||
else:
|
else:
|
||||||
logging.error(f"Path does not exist: {path}")
|
logging.error(f"Path does not exist: {path}")
|
||||||
return None
|
return None
|
||||||
@@ -1668,7 +1665,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
|||||||
# except requests.exceptions.ConnectionError:
|
# except requests.exceptions.ConnectionError:
|
||||||
# requests.status_code = "Connection: "
|
# requests.status_code = "Connection: "
|
||||||
# Perform summarization based on the specified API
|
# Perform summarization based on the specified API
|
||||||
elif api_name and api_key:
|
elif api_name:
|
||||||
logging.debug(f"MAIN: Summarization being performed by {api_name}")
|
logging.debug(f"MAIN: Summarization being performed by {api_name}")
|
||||||
json_file_path = audio_file.replace('.wav', '.segments.json')
|
json_file_path = audio_file.replace('.wav', '.segments.json')
|
||||||
if api_name.lower() == 'openai':
|
if api_name.lower() == 'openai':
|
||||||
@@ -1758,7 +1755,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
|||||||
#end_time = time.monotonic()
|
#end_time = time.monotonic()
|
||||||
# print("Total program execution time: " + timedelta(seconds=end_time - start_time))
|
# print("Total program execution time: " + timedelta(seconds=end_time - start_time))
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@@ -1793,6 +1790,7 @@ if __name__ == "__main__":
|
|||||||
logging.basicConfig(level=getattr(logging, log_level), format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(level=getattr(logging, log_level), format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
|
||||||
custom_prompt = args.custom_prompt
|
custom_prompt = args.custom_prompt
|
||||||
|
|
||||||
if custom_prompt == "":
|
if custom_prompt == "":
|
||||||
logging.debug(f"Custom prompt defined, will use \n\nf{custom_prompt} \n\nas the prompt")
|
logging.debug(f"Custom prompt defined, will use \n\nf{custom_prompt} \n\nas the prompt")
|
||||||
print(f"Custom Prompt has been defined. Custom prompt: \n\n {args.custom_prompt}")
|
print(f"Custom Prompt has been defined. Custom prompt: \n\n {args.custom_prompt}")
|
||||||
@@ -1808,7 +1806,6 @@ if __name__ == "__main__":
|
|||||||
print("No custom prompt defined, will use default")
|
print("No custom prompt defined, will use default")
|
||||||
|
|
||||||
if args.user_interface:
|
if args.user_interface:
|
||||||
|
|
||||||
launch_ui(demo_mode=False)
|
launch_ui(demo_mode=False)
|
||||||
else:
|
else:
|
||||||
if not args.input_path:
|
if not args.input_path:
|
||||||
@@ -1835,6 +1832,8 @@ if __name__ == "__main__":
|
|||||||
# Get all API keys from the config
|
# Get all API keys from the config
|
||||||
api_keys = {key: value for key, value in config.items('API') if key.endswith('_api_key')}
|
api_keys = {key: value for key, value in config.items('API') if key.endswith('_api_key')}
|
||||||
|
|
||||||
|
api_name = args.api_name
|
||||||
|
|
||||||
# Rolling Summarization will only be performed if an API is specified and the API key is available
|
# Rolling Summarization will only be performed if an API is specified and the API key is available
|
||||||
# and the rolling summarization flag is set
|
# and the rolling summarization flag is set
|
||||||
#
|
#
|
||||||
|
|||||||
Reference in New Issue
Block a user