mirror of
https://github.com/jlengrand/tldw.git
synced 2026-03-10 00:41:17 +00:00
And it's back to working.
Ok. So, rolling summary works with ChatGPT via CLI. Passing in a list works via CLI. Summarization of said list works via CLI. Demo GUI works. Dark/Light mode toggle does not work. Simple/Advanced mode toggle does not work. Detail slider in the GUI does not work. No current option for rolling summarization in the GUI. Lack of a 're-summarize/ask a question about the transcription' box in the GUI.
This commit is contained in:
@@ -1,227 +0,0 @@
|
||||
from typing import List, Tuple, Optional
|
||||
from openai import OpenAI
|
||||
import tiktoken
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# Adapted from: https://github.com/openai/openai-cookbook/blob/main/examples/Summarizing_long_documents.ipynb

# Load the sample document used throughout this demo script.
with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as source_file:
    artificial_intelligence = source_file.read()

# Report the document's token length for the target model.
encoding = tiktoken.encoding_for_model('gpt-4-turbo')
print(len(encoding.encode(artificial_intelligence)))

# Client wrapper around the OpenAI API.
# NOTE(review): the API key is hard-coded as an empty string — presumably it
# should be filled in or read from the environment; confirm before running.
client = OpenAI(api_key="")
|
||||
|
||||
|
||||
def get_chat_completion(messages, model='gpt-4-turbo'):
    """Send a chat request to the OpenAI API and return the reply text.

    Parameters:
    - messages: list of {"role": ..., "content": ...} message dicts.
    - model (str, optional): chat model name. Defaults to 'gpt-4-turbo'.

    Returns:
    - str: the content of the first choice in the completion.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,  # deterministic output for reproducible summaries
    )
    return completion.choices[0].message.content
|
||||
|
||||
|
||||
# Message Chunking <----- THE JUICY STUFF
|
||||
def tokenize(text: str) -> List[int]:
    """Encode *text* into tokens for the 'gpt-4-turbo' model.

    Returns the list of integer token ids. tiktoken's ``encode()`` yields
    ints, not strings, so the original ``List[str]`` annotation was wrong;
    callers only ever take ``len()`` of the result.
    """
    encoding = tiktoken.encoding_for_model('gpt-4-turbo')
    return encoding.encode(text)
|
||||
|
||||
|
||||
# This function chunks a text into smaller pieces based on a maximum token count and a delimiter
|
||||
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """Split *input_string* on *delimiter* and regroup the pieces into
    chunks of at most *max_tokens* tokens each.

    The delimiter is re-appended to every returned chunk, so joining the
    chunks approximately reproduces the original text. Pieces too large
    to fit the budget are dropped (with a warning) by the combiner.
    """
    pieces = input_string.split(delimiter)
    combined, _, dropped = combine_chunks_with_no_minimum(
        pieces, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True
    )
    if dropped > 0:
        print(f"Warning: {dropped} chunks were dropped due to exceeding the token limit.")
    return [f"{piece}{delimiter}" for piece in combined]
|
||||
|
||||
|
||||
# This function combines text chunks into larger blocks without exceeding a specified token count.
|
||||
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
|
||||
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """Greedily combine *chunks* into blocks of at most *max_tokens* tokens.

    Parameters:
    - chunks: text pieces to combine, in order.
    - max_tokens: token budget for each combined block.
    - chunk_delimiter: string used to join pieces within a block.
    - header: optional text prepended to every combined block.
    - add_ellipsis_for_overflow: when a single chunk alone exceeds the
      budget, append "..." to the current candidate (if it fits) to mark
      the dropped content.

    Returns a 3-tuple of (combined blocks, per-block lists of the original
    chunk indices, number of chunks dropped for overflowing the budget).
    NOTE: the original annotation claimed a 2-tuple ``Tuple[List[str],
    List[int]]``; the code has always returned three values with nested
    index lists, so the annotation is corrected here.
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined blocks
    output_indices = []  # per-block lists of source chunk indices
    candidate = (
        [] if header is None else [header]
    )  # pieces accumulated for the block currently being built
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # A chunk that cannot fit even on its own is dropped outright.
        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print("warning: chunk overflow")
            if (
                    add_ellipsis_for_overflow
                    and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
            # Count every dropped chunk; the original only counted drops
            # where the ellipsis happened to fit, under-reporting losses.
            dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # Estimate the token count with the current chunk added.
        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))
        # If the budget would be exceeded, flush the candidate and start a new one.
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # Otherwise keep extending the current candidate.
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # Flush the trailing candidate if it holds anything beyond the header.
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
|
||||
|
||||
|
||||
def summarize(text: str,
              detail: float = 0,
              model: str = 'gpt-4-turbo',
              additional_instructions: Optional[str] = None,
              minimum_chunk_size: Optional[int] = 500,
              chunk_delimiter: str = ".",
              summarize_recursively=False,
              verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the
      summary. 0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The model to use for generating summaries. Defaults to 'gpt-4-turbo'.
      (The docstring previously claimed 'gpt-3.5-turbo', contradicting the actual default.)
    - additional_instructions (Optional[str], optional): Additional instructions to provide to the
      model for customizing summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
    - summarize_recursively (bool, optional): If True, summaries are generated recursively, using
      previous summaries for context.
    - verbose (bool, optional): If True, prints detailed information about the chunking process.

    Returns:
    - str: The final compiled summary of the text.

    The function first determines the number of chunks by interpolating between a minimum and a maximum
    chunk count based on the `detail` parameter. It then splits the text into chunks and summarizes each
    chunk. If `summarize_recursively` is True, each summary is based on the previous summaries, adding
    more context to the summarization process. The function returns a compiled summary of all chunks.
    """

    # check detail is set correctly (assert kept so callers that expect an
    # AssertionError on bad input are unaffected)
    assert 0 <= detail <= 1

    # interpolate the number of chunks to get the specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))

    # adjust chunk_size based on interpolated number of chunks
    document_length = len(tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")

    # set system message
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Creating a structured prompt for recursive summarization
            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
        else:
            # Directly passing the chunk for summarization without recursive context
            user_message_content = chunk

        # Constructing messages based on whether recursive summarization is applied
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        # Request the summary of this chunk from the model
        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    # Compile final summary from partial summaries
    final_summary = '\n\n'.join(accumulated_summaries)

    return final_summary
|
||||
|
||||
# Demo: summarize the sample document at increasing levels of detail.

# Summary at 0 detail
summary_with_detail_0 = summarize(artificial_intelligence, detail=0, verbose=True)

# Summary at 0.25 detail
summary_with_detail_pt25 = summarize(artificial_intelligence, detail=0.25, verbose=True)

# Summary at 0.5 detail
summary_with_detail_pt5 = summarize(artificial_intelligence, detail=0.5, verbose=True)

# Summary at 0.75 detail
summary_with_detail_pt75 = summarize(artificial_intelligence, detail=0.75, verbose=True)

# Summary at 1 detail
summary_with_detail_1 = summarize(artificial_intelligence, detail=1, verbose=True)

# Lengths of summaries. The original was a bare notebook-style expression whose
# result is silently discarded when run as a script; print it so the comparison
# is actually visible.
print([len(tokenize(x)) for x in
       [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5,
        summary_with_detail_pt75, summary_with_detail_1]])

# print 0 detail summary
print(summary_with_detail_0)

# print 0.25 detail summary
print(summary_with_detail_pt25)

# print 0.5 detail summary
print(summary_with_detail_pt5)

# print 0.75 detail summary
print(summary_with_detail_pt75)

# print 1.0 detail summary
print(summary_with_detail_1)

# Print summary using additional instructions:
summary_with_additional_instructions = summarize(artificial_intelligence, detail=0.1,
                                                 additional_instructions="Write in point form and focus on numerical data.")
print(summary_with_additional_instructions)

# Print summary using recursive summarization:
recursive_summary = summarize(artificial_intelligence, detail=0.1, summarize_recursively=True)
print(recursive_summary)
21
summarize.py
21
summarize.py
@@ -318,12 +318,7 @@ def read_paths_from_file(file_path):
|
||||
""" Reads a file containing URLs or local file paths and returns them as a list. """
|
||||
paths = [] # Initialize paths as an empty list
|
||||
with open(file_path, 'r') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
if line and not os.path.exists(
|
||||
os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
|
||||
logging.debug("line successfully imported from file and added to list to be transcribed")
|
||||
paths.append(line)
|
||||
paths = [line.strip() for line in file]
|
||||
return paths
|
||||
|
||||
|
||||
@@ -331,10 +326,12 @@ def process_path(path):
|
||||
""" Decides whether the path is a URL or a local file and processes accordingly. """
|
||||
if path.startswith('http'):
|
||||
logging.debug("file is a URL")
|
||||
return get_youtube(path) # For YouTube URLs, modify to download and extract info
|
||||
# For YouTube URLs, modify to download and extract info
|
||||
return get_youtube(path)
|
||||
elif os.path.exists(path):
|
||||
logging.debug("File is a path")
|
||||
return process_local_file(path) # For local files, define a function to handle them
|
||||
# For local files, define a function to handle them
|
||||
return process_local_file(path)
|
||||
else:
|
||||
logging.error(f"Path does not exist: {path}")
|
||||
return None
|
||||
@@ -1668,7 +1665,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
||||
# except requests.exceptions.ConnectionError:
|
||||
# requests.status_code = "Connection: "
|
||||
# Perform summarization based on the specified API
|
||||
elif api_name and api_key:
|
||||
elif api_name:
|
||||
logging.debug(f"MAIN: Summarization being performed by {api_name}")
|
||||
json_file_path = audio_file.replace('.wav', '.segments.json')
|
||||
if api_name.lower() == 'openai':
|
||||
@@ -1758,7 +1755,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
||||
#end_time = time.monotonic()
|
||||
# print("Total program execution time: " + timedelta(seconds=end_time - start_time))
|
||||
|
||||
return results
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -1793,6 +1790,7 @@ if __name__ == "__main__":
|
||||
logging.basicConfig(level=getattr(logging, log_level), format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
custom_prompt = args.custom_prompt
|
||||
|
||||
if custom_prompt == "":
|
||||
logging.debug(f"Custom prompt defined, will use \n\nf{custom_prompt} \n\nas the prompt")
|
||||
print(f"Custom Prompt has been defined. Custom prompt: \n\n {args.custom_prompt}")
|
||||
@@ -1808,7 +1806,6 @@ if __name__ == "__main__":
|
||||
print("No custom prompt defined, will use default")
|
||||
|
||||
if args.user_interface:
|
||||
|
||||
launch_ui(demo_mode=False)
|
||||
else:
|
||||
if not args.input_path:
|
||||
@@ -1835,6 +1832,8 @@ if __name__ == "__main__":
|
||||
# Get all API keys from the config
|
||||
api_keys = {key: value for key, value in config.items('API') if key.endswith('_api_key')}
|
||||
|
||||
api_name = args.api_name
|
||||
|
||||
# Rolling Summarization will only be performed if an API is specified and the API key is available
|
||||
# and the rolling summarization flag is set
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user