mirror of
https://github.com/jlengrand/tldw.git
synced 2026-03-10 08:51:17 +00:00
And it's back to working.
OK. So, rolling summary works with ChatGPT via the CLI. Passing in a list works via the CLI. Summarization of said list works via the CLI. The demo GUI works. The dark/light mode toggle does not work. The simple/advanced mode toggle does not work. The detail slider in the GUI does not work. There is currently no option for rolling summarization in the GUI, and the GUI lacks a "re-summarize / ask a question about the transcription" box.
This commit is contained in:
@@ -1,227 +0,0 @@
|
|||||||
import os
from typing import List, Optional, Tuple

import tiktoken
from openai import OpenAI
from tqdm import tqdm
|
|
||||||
|
|
||||||
|
|
||||||
# script from: https://github.com/openai/openai-cookbook/blob/main/examples/Summarizing_long_documents.ipynb
|
|
||||||
|
|
||||||
|
|
||||||
# Open dataset
# NOTE(review): Windows-style relative path — assumes the script runs from the
# repository root on Windows; confirm before running elsewhere.
with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as file:
    artificial_intelligence = file.read()

# Load the tokenizer for the target model and print the dataset's token count,
# as a sanity check that chunking is actually needed.
encoding = tiktoken.encoding_for_model('gpt-4-turbo')
print(len(encoding.encode(artificial_intelligence)))

# Client wrapper to OpenAI.
# Fix: the key was hard-coded as "" (every request would fail authentication);
# read it from the environment instead. Falls back to "" so behavior when the
# variable is unset matches the original.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
|
|
||||||
|
|
||||||
|
|
||||||
def get_chat_completion(messages, model='gpt-4-turbo'):
    """Send *messages* to the chat-completions endpoint and return the reply text.

    Temperature is pinned to 0 so repeated runs produce deterministic summaries.
    Uses the module-level ``client``.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
||||||
|
|
||||||
|
|
||||||
# Message Chunking <----- THE JUICY STUFF
|
|
||||||
def tokenize(text: str) -> List[int]:
    """Encode *text* with the gpt-4-turbo tokenizer and return the token ids.

    Fix: the original annotation claimed ``List[str]``, but
    ``encoding.encode`` returns integer token ids.
    (``tiktoken.encoding_for_model`` caches the loaded encoding internally,
    so re-resolving it on every call is cheap — presumably; confirm if this
    becomes a hot path.)
    """
    encoding = tiktoken.encoding_for_model('gpt-4-turbo')
    return encoding.encode(text)
|
|
||||||
|
|
||||||
|
|
||||||
# Splits text on a delimiter and packs the pieces into token-budgeted chunks.
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """Split *input_string* on *delimiter*, pack the pieces into chunks of at
    most *max_tokens* tokens, and re-append the delimiter to each packed chunk.

    Prints a warning when any pieces had to be dropped for exceeding the
    token budget on their own.
    """
    pieces = input_string.split(delimiter)
    packed, _, dropped_chunk_count = combine_chunks_with_no_minimum(
        pieces, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True
    )
    if dropped_chunk_count > 0:
        print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
    return [f"{chunk}{delimiter}" for chunk in packed]
|
|
||||||
|
|
||||||
|
|
||||||
# This function combines text chunks into larger blocks without exceeding a specified token count.
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """Greedily pack *chunks*, in order, into combined blocks of at most
    *max_tokens* tokens (measured with ``tokenize``).

    Parameters:
    - chunks: pieces of text to pack.
    - max_tokens: token budget per combined block.
    - chunk_delimiter: string joined between pieces within a block.
    - header: optional text prepended to every combined block.
    - add_ellipsis_for_overflow: if True, a piece that alone exceeds the
      budget is represented by "..." in the current block (when "..." fits).

    Returns a 3-tuple of (combined blocks, per-block lists of the source
    indices that went into each block, number of dropped over-budget pieces).

    Fixes vs. original: the return annotation claimed ``Tuple[List[str],
    List[int]]`` although the function returns a 3-tuple whose second element
    is a list of index lists; the overflow warning used an f-string with no
    placeholders.
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined chunks
    output_indices = []  # list to hold the indices of the final combined chunks
    candidate = (
        [] if header is None else [header]
    )  # list to hold the current combined chunk candidate
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # A piece that exceeds the budget on its own can never be packed.
        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print("warning: chunk overflow")
            if (
                    add_ellipsis_for_overflow
                    and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
                dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # estimate token count with the current chunk added
        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))
        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # otherwise keep extending the candidate
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # add the remaining candidate to output if it's not empty
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
|
|
||||||
|
|
||||||
|
|
||||||
def summarize(text: str,
              detail: float = 0,
              model: str = 'gpt-4-turbo',
              additional_instructions: Optional[str] = None,
              minimum_chunk_size: Optional[int] = 500,
              chunk_delimiter: str = ".",
              summarize_recursively=False,
              verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail.
      0 leads to a higher-level summary; 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The model used for generating summaries. Defaults to 'gpt-4-turbo'
      (fix: the original docstring incorrectly said 'gpt-3.5-turbo').
    - additional_instructions (Optional[str], optional): Extra instructions appended to the
      system prompt for customizing summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum token size for text chunks.
      Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks.
      Defaults to ".".
    - summarize_recursively (bool, optional): If True, each chunk summary is generated with the
      previously accumulated summaries as context.
    - verbose (bool, optional): If True, prints detailed information about the chunking process.

    Returns:
    - str: The final compiled summary of the text.

    Raises:
    - ValueError: If *detail* is outside the range [0, 1].

    The function first interpolates the number of chunks between a minimum and a maximum chunk
    count based on `detail`, then splits the text accordingly and summarizes each chunk. If
    `summarize_recursively` is True, each summary is based on the previous summaries, adding
    more context to the summarization process.
    """
    # Validate `detail` (fix: was an `assert`, which is stripped under -O).
    if not 0 <= detail <= 1:
        raise ValueError("detail must be between 0 and 1")

    # interpolate the number of chunks to get the specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))

    # adjust chunk_size based on interpolated number of chunks
    document_length = len(tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")

    # set system message
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Creating a structured prompt for recursive summarization
            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
        else:
            # Directly passing the chunk for summarization without recursive context
            user_message_content = chunk

        # Constructing messages based on whether recursive summarization is applied
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        # Summarize this chunk and accumulate the result
        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    # Compile final summary from partial summaries
    final_summary = '\n\n'.join(accumulated_summaries)

    return final_summary
|
|
||||||
|
|
||||||
# Generate summaries at increasing levels of detail.
summary_with_detail_0 = summarize(artificial_intelligence, detail=0, verbose=True)

summary_with_detail_pt25 = summarize(artificial_intelligence, detail=0.25, verbose=True)

summary_with_detail_pt5 = summarize(artificial_intelligence, detail=0.5, verbose=True)

summary_with_detail_pt75 = summarize(artificial_intelligence, detail=0.75, verbose=True)

# Summary at 1 detail (fix: comment typo "Summart")
summary_with_detail_1 = summarize(artificial_intelligence, detail=1, verbose=True)

# Lengths of summaries (fix: the bare list comprehension was notebook residue
# whose value was silently discarded when run as a script — print it instead).
summary_lengths = [len(tokenize(x)) for x in
                   [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5,
                    summary_with_detail_pt75, summary_with_detail_1]]
print(summary_lengths)
|
|
||||||
|
|
||||||
# Print each detail-level summary, in ascending order of detail.
for detail_summary in (summary_with_detail_0,
                       summary_with_detail_pt25,
                       summary_with_detail_pt5,
                       summary_with_detail_pt75,
                       summary_with_detail_1):
    print(detail_summary)
|
|
||||||
|
|
||||||
|
|
||||||
# Summary steered by additional instructions.
summary_with_additional_instructions = summarize(
    artificial_intelligence,
    detail=0.1,
    additional_instructions="Write in point form and focus on numerical data.",
)
print(summary_with_additional_instructions)

# Summary built recursively, feeding earlier chunk summaries back in as context.
recursive_summary = summarize(artificial_intelligence, detail=0.1, summarize_recursively=True)
print(recursive_summary)
|
|
||||||
|
|
||||||
21
summarize.py
21
summarize.py
@@ -318,12 +318,7 @@ def read_paths_from_file(file_path):
|
|||||||
""" Reads a file containing URLs or local file paths and returns them as a list. """
|
""" Reads a file containing URLs or local file paths and returns them as a list. """
|
||||||
paths = [] # Initialize paths as an empty list
|
paths = [] # Initialize paths as an empty list
|
||||||
with open(file_path, 'r') as file:
|
with open(file_path, 'r') as file:
|
||||||
for line in file:
|
paths = [line.strip() for line in file]
|
||||||
line = line.strip()
|
|
||||||
if line and not os.path.exists(
|
|
||||||
os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
|
|
||||||
logging.debug("line successfully imported from file and added to list to be transcribed")
|
|
||||||
paths.append(line)
|
|
||||||
return paths
|
return paths
|
||||||
|
|
||||||
|
|
||||||
@@ -331,10 +326,12 @@ def process_path(path):
|
|||||||
""" Decides whether the path is a URL or a local file and processes accordingly. """
|
""" Decides whether the path is a URL or a local file and processes accordingly. """
|
||||||
if path.startswith('http'):
|
if path.startswith('http'):
|
||||||
logging.debug("file is a URL")
|
logging.debug("file is a URL")
|
||||||
return get_youtube(path) # For YouTube URLs, modify to download and extract info
|
# For YouTube URLs, modify to download and extract info
|
||||||
|
return get_youtube(path)
|
||||||
elif os.path.exists(path):
|
elif os.path.exists(path):
|
||||||
logging.debug("File is a path")
|
logging.debug("File is a path")
|
||||||
return process_local_file(path) # For local files, define a function to handle them
|
# For local files, define a function to handle them
|
||||||
|
return process_local_file(path)
|
||||||
else:
|
else:
|
||||||
logging.error(f"Path does not exist: {path}")
|
logging.error(f"Path does not exist: {path}")
|
||||||
return None
|
return None
|
||||||
@@ -1668,7 +1665,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
|||||||
# except requests.exceptions.ConnectionError:
|
# except requests.exceptions.ConnectionError:
|
||||||
# requests.status_code = "Connection: "
|
# requests.status_code = "Connection: "
|
||||||
# Perform summarization based on the specified API
|
# Perform summarization based on the specified API
|
||||||
elif api_name and api_key:
|
elif api_name:
|
||||||
logging.debug(f"MAIN: Summarization being performed by {api_name}")
|
logging.debug(f"MAIN: Summarization being performed by {api_name}")
|
||||||
json_file_path = audio_file.replace('.wav', '.segments.json')
|
json_file_path = audio_file.replace('.wav', '.segments.json')
|
||||||
if api_name.lower() == 'openai':
|
if api_name.lower() == 'openai':
|
||||||
@@ -1758,7 +1755,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
|||||||
#end_time = time.monotonic()
|
#end_time = time.monotonic()
|
||||||
# print("Total program execution time: " + timedelta(seconds=end_time - start_time))
|
# print("Total program execution time: " + timedelta(seconds=end_time - start_time))
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@@ -1793,6 +1790,7 @@ if __name__ == "__main__":
|
|||||||
logging.basicConfig(level=getattr(logging, log_level), format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(level=getattr(logging, log_level), format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
|
||||||
custom_prompt = args.custom_prompt
|
custom_prompt = args.custom_prompt
|
||||||
|
|
||||||
if custom_prompt == "":
|
if custom_prompt == "":
|
||||||
logging.debug(f"Custom prompt defined, will use \n\nf{custom_prompt} \n\nas the prompt")
|
logging.debug(f"Custom prompt defined, will use \n\nf{custom_prompt} \n\nas the prompt")
|
||||||
print(f"Custom Prompt has been defined. Custom prompt: \n\n {args.custom_prompt}")
|
print(f"Custom Prompt has been defined. Custom prompt: \n\n {args.custom_prompt}")
|
||||||
@@ -1808,7 +1806,6 @@ if __name__ == "__main__":
|
|||||||
print("No custom prompt defined, will use default")
|
print("No custom prompt defined, will use default")
|
||||||
|
|
||||||
if args.user_interface:
|
if args.user_interface:
|
||||||
|
|
||||||
launch_ui(demo_mode=False)
|
launch_ui(demo_mode=False)
|
||||||
else:
|
else:
|
||||||
if not args.input_path:
|
if not args.input_path:
|
||||||
@@ -1835,6 +1832,8 @@ if __name__ == "__main__":
|
|||||||
# Get all API keys from the config
|
# Get all API keys from the config
|
||||||
api_keys = {key: value for key, value in config.items('API') if key.endswith('_api_key')}
|
api_keys = {key: value for key, value in config.items('API') if key.endswith('_api_key')}
|
||||||
|
|
||||||
|
api_name = args.api_name
|
||||||
|
|
||||||
# Rolling Summarization will only be performed if an API is specified and the API key is available
|
# Rolling Summarization will only be performed if an API is specified and the API key is available
|
||||||
# and the rolling summarization flag is set
|
# and the rolling summarization flag is set
|
||||||
#
|
#
|
||||||
|
|||||||
Reference in New Issue
Block a user