Files
tldw/Long_Summarize_openai.py
Robert 08fcdb6b3a App.py demo now works
App.py drop-down selection should work on HF now...
2024-05-12 01:53:45 -07:00

228 lines
9.3 KiB
Python

import os
from pathlib import Path
from typing import List, Tuple, Optional

import tiktoken
from openai import OpenAI
from tqdm import tqdm
# script from: https://github.com/openai/openai-cookbook/blob/main/examples/Summarizing_long_documents.ipynb
# Open dataset.
# The path is assembled with pathlib so it resolves on POSIX hosts as well as
# on Windows; a literal ".\\dir\\file" string is treated as one odd file name
# on Linux. The location is still relative to the current working directory.
dataset_path = Path("tldw-original-scripts") / "Samples" / "ai_wikipedia.txt"
# The encoding is pinned so the read does not depend on the platform's locale
# default. NOTE(review): assumes the sample file is UTF-8 -- confirm.
with dataset_path.open("r", encoding="utf-8") as file:
    artificial_intelligence = file.read()
# load encoding and check length of dataset
encoding = tiktoken.encoding_for_model('gpt-4-turbo')
print(len(encoding.encode(artificial_intelligence)))
# Call wrapper to OpenAI.
# The key is taken from the OPENAI_API_KEY environment variable so it never
# has to be written into the source; when the variable is unset this falls
# back to the empty string that was hard-coded here before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
def get_chat_completion(messages, model='gpt-4-turbo'):
    """
    Send `messages` to the chat completions endpoint and return the reply text.

    The request goes through the module-level `client` with the sampling
    temperature fixed at 0.

    Parameters:
    - messages: The chat messages, forwarded unchanged as the request's `messages`.
    - model (optional): The model name for the request. Defaults to 'gpt-4-turbo'.

    Returns:
    - The `content` of the message in the first returned choice.
    """
    request_options = {
        "model": model,
        "messages": messages,
        "temperature": 0,
    }
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content
# Message Chunking <----- THE JUICY STUFF
def tokenize(text: str, model: str = 'gpt-4-turbo') -> List[int]:
    """
    Encode `text` into tokens with the tiktoken encoding registered for `model`.

    Parameters:
    - text (str): The text to encode.
    - model (str, optional): The model whose encoding is looked up via
      `tiktoken.encoding_for_model`. Defaults to 'gpt-4-turbo'.

    Returns:
    - List[int]: The token ids produced by the encoding (integers, not strings).
      Callers in this file only use the length of this list as a token count.
    """
    return tiktoken.encoding_for_model(model).encode(text)
# This function chunks a text into smaller pieces based on a maximum token count and a delimiter
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """
    Split `input_string` on `delimiter` and regroup the pieces into chunks.

    The pieces are handed to `combine_chunks_with_no_minimum` with `max_tokens`
    as the limit, `delimiter` as the joiner, and ellipsis-on-overflow enabled.
    A warning is printed when that call reports dropped pieces.

    Parameters:
    - input_string (str): The text to split.
    - max_tokens (int): The token limit passed on to the combining step.
    - delimiter (str): The separator used both to split and to re-join the text.

    Returns:
    - List[str]: The combined chunks, each with `delimiter` appended to its end.
    """
    pieces = input_string.split(delimiter)
    merged, _indices, n_dropped = combine_chunks_with_no_minimum(
        pieces,
        max_tokens,
        chunk_delimiter=delimiter,
        add_ellipsis_for_overflow=True,
    )
    if n_dropped > 0:
        print(f"Warning: {n_dropped} chunks were dropped due to exceeding the token limit.")
    # Every combined chunk gets the delimiter appended to its end.
    return [f"{block}{delimiter}" for block in merged]
# This function combines text chunks into larger blocks without exceeding a specified token count.
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """
    Greedily pack consecutive `chunks` into larger blocks of at most `max_tokens` tokens.

    Token counts are measured with `tokenize` on the delimiter-joined text.
    A chunk that does not fit on its own (together with `header`, if given) is
    skipped and counted as dropped.

    Parameters:
    - chunks (List[str]): The pieces of text to combine, in order.
    - max_tokens (int): The token limit for each combined block.
    - chunk_delimiter (str, optional): The string used to join pieces inside a block.
      Defaults to a blank line.
    - header (Optional[str], optional): If given, placed at the start of every block.
    - add_ellipsis_for_overflow (bool, optional): If True, a "..." entry is added to
      the current block in place of a skipped chunk, provided the block stays
      within `max_tokens`.

    Returns:
    - A 3-tuple `(output, output_indices, dropped_chunk_count)`:
      `output` is the list of combined blocks, `output_indices` holds for each
      block the list of indices into `chunks` that went into it, and
      `dropped_chunk_count` is the number of chunks skipped for exceeding
      `max_tokens` on their own.
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined chunks
    output_indices = []  # list to hold the indices of the final combined chunks
    # list to hold the current combined chunk candidate
    candidate = [] if header is None else [header]
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print("warning: chunk overflow")
            if (
                add_ellipsis_for_overflow
                and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
            # Count every skipped chunk, whether or not an ellipsis could be
            # inserted for it, so the returned count matches what was dropped.
            dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # estimate token count with the current chunk added
        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))
        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # otherwise keep extending the candidate
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # add the remaining candidate to output if it's not empty
    # (a candidate holding only the header counts as empty)
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
def summarize(text: str,
              detail: float = 0,
              model: str = 'gpt-4-turbo',
              additional_instructions: Optional[str] = None,
              minimum_chunk_size: Optional[int] = 500,
              chunk_delimiter: str = ".",
              summarize_recursively=False,
              verbose=False) -> str:
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the
      summary. 0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The model to use for generating summaries. Defaults to 'gpt-4-turbo'.
    - additional_instructions (Optional[str], optional): Additional instructions to provide to the
      model for customizing summaries. They are appended to the system message.
    - minimum_chunk_size (Optional[int], optional): The minimum size, in tokens, for text chunks.
      Defaults to 500; passing None also selects 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
    - summarize_recursively (bool, optional): If True, summaries are generated recursively,
      using previous summaries for context.
    - verbose (bool, optional): If True, prints detailed information about the chunking process.

    Returns:
    - str: The final compiled summary of the text (the per-chunk summaries joined by blank lines).

    The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
    based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
    `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the
    summarization process. The function returns a compiled summary of all chunks.
    """
    # check detail is set correctly
    assert 0 <= detail <= 1, "detail must be between 0 and 1"
    # The signature allows None for minimum_chunk_size, but the chunking and the
    # max() below need a number; fall back to the documented default.
    if minimum_chunk_size is None:
        minimum_chunk_size = 500
    # interpolate the number of chunks based to get specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    # Clamp to at least min_chunks: if chunking produced no chunks at all the
    # interpolation can evaluate to 0, which would divide by zero below.
    num_chunks = max(min_chunks, int(min_chunks + detail * (max_chunks - min_chunks)))
    # adjust chunk_size based on interpolated number of chunks
    document_length = len(tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")
    # set system message
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"
    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Creating a structured prompt for recursive summarization:
            # all summaries produced so far are passed along as context
            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
        else:
            # Directly passing the chunk for summarization without recursive context
            user_message_content = chunk
        # Constructing messages based on whether recursive summarization is applied
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]
        # One model call per chunk; the reply text is collected in order
        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)
    # Compile final summary from partial summaries
    final_summary = '\n\n'.join(accumulated_summaries)
    return final_summary
# Summary at 0 detail
summary_with_detail_0 = summarize(artificial_intelligence, detail=0, verbose=True)
# Summary at 0.25 detail
summary_with_detail_pt25 = summarize(artificial_intelligence, detail=0.25, verbose=True)
# Summary at 0.5 detail
summary_with_detail_pt5 = summarize(artificial_intelligence, detail=0.5, verbose=True)
# Summary at 0.75 detail
summary_with_detail_pt75 = summarize(artificial_intelligence, detail=0.75, verbose=True)
# Summary at 1 detail
summary_with_detail_1 = summarize(artificial_intelligence, detail=1, verbose=True)
# Lengths of summaries, in tokens.
# A bare expression is only displayed in a notebook cell; in a script its
# value is discarded, so the list is printed explicitly.
print([len(tokenize(x)) for x in
       [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5,
        summary_with_detail_pt75, summary_with_detail_1]])
# print 0 detail summary
print(summary_with_detail_0)
# print 0.25 detail summary
print(summary_with_detail_pt25)
# print 0.5 detail summary
print(summary_with_detail_pt5)
# print 0.75 detail summary
print(summary_with_detail_pt75)
# print 1.0 detail summary
print(summary_with_detail_1)
# Print summary using additional instructions:
summary_with_additional_instructions = summarize(
    artificial_intelligence,
    detail=0.1,
    additional_instructions="Write in point form and focus on numerical data.",
)
print(summary_with_additional_instructions)
# Print summary using recursive summarization:
recursive_summary = summarize(artificial_intelligence, detail=0.1, summarize_recursively=True)
print(recursive_summary)