mirror of
https://github.com/jlengrand/tldw.git
synced 2026-03-10 08:51:17 +00:00
More app.py fixes...
This commit is contained in:
BIN
.gitignore
vendored
BIN
.gitignore
vendored
Binary file not shown.
167
HF/app.py
167
HF/app.py
@@ -39,7 +39,7 @@ import yt_dlp
|
||||
# 2. Usage of/Hardcoding HF_TOKEN as token for API calls
|
||||
# 3. Usage of HuggingFace for Inference
|
||||
# 4. Other stuff I can't remember. Will eventually do a diff and document them.
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
####
|
||||
@@ -63,10 +63,10 @@ import yt_dlp
|
||||
# llama.cpp)/`ooba` (oobabooga/text-gen-webui)/`kobold` (kobold.cpp)/`tabby` (Tabbyapi)) API:** python summarize.py
|
||||
# -v https://www.youtube.com/watch?v=4nd1CDZP21s -api <your choice of API>` - Make sure to put your API key into
|
||||
# `config.txt` under the appropriate API variable
|
||||
#
|
||||
#
|
||||
# **Download Audio+Video from a list of videos in a text file (can be file paths or URLs) and have them all summarized:**
|
||||
# `python summarize.py ./local/file_on_your/system --api_name <API_name>`
|
||||
#
|
||||
#
|
||||
# **Run it as a WebApp**
|
||||
# `python summarize.py -gui` - This requires you to either stuff your API keys into the `config.txt` file, or pass them into the app every time you want to use it.
|
||||
# Can be helpful for setting up a shared instance, but not wanting people to perform inference on your server.
|
||||
@@ -120,7 +120,7 @@ output_path = config.get('Paths', 'output_path', fallback='results')
|
||||
processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
|
||||
|
||||
# Log file
|
||||
#logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
|
||||
# logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
|
||||
|
||||
#
|
||||
#
|
||||
@@ -148,8 +148,8 @@ print(r"""
|
||||
| | | | / / | | | || |/\| |
|
||||
| | | |____ / / | |/ / \ /\ / _
|
||||
\_/ \_____//_/ |___/ \/ \/ (_)
|
||||
|
||||
|
||||
|
||||
|
||||
_ _
|
||||
| | | |
|
||||
| |_ ___ ___ | | ___ _ __ __ _
|
||||
@@ -168,8 +168,8 @@ print(r"""
|
||||
|
||||
####################################################################################################################################
|
||||
# System Checks
|
||||
#
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
# Perform Platform Check
|
||||
userOS = ""
|
||||
@@ -291,13 +291,13 @@ def download_ffmpeg():
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
#
|
||||
####################################################################################################################################
|
||||
|
||||
|
||||
####################################################################################################################################
|
||||
# Processing Paths and local file handling
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
def read_paths_from_file(file_path):
|
||||
@@ -374,7 +374,7 @@ def process_url(input_path, num_speakers=2, whisper_model="small.en", custom_pro
|
||||
return json_data, summary_file_path, json_file_path, summary_file_path
|
||||
|
||||
else:
|
||||
return json_data, "Summary not available.", json_file_path, None
|
||||
return json_data, "Summary not available.", json_file_path, "Summary not available."
|
||||
|
||||
else:
|
||||
return None, "No results found.", None, None
|
||||
@@ -508,8 +508,8 @@ def download_video(video_url, download_path, info_dict, download_video_flag):
|
||||
]
|
||||
subprocess.run(ffmpeg_command, check=True)
|
||||
else:
|
||||
logging.error("You shouldn't be here...")
|
||||
exit()
|
||||
logging.error("ffmpeg: Unsupported operating system for video download and merging.")
|
||||
raise RuntimeError("ffmpeg: Unsupported operating system for video download and merging.")
|
||||
os.remove(video_file_path)
|
||||
os.remove(audio_file_path)
|
||||
|
||||
@@ -529,7 +529,7 @@ def download_video(video_url, download_path, info_dict, download_video_flag):
|
||||
# https://www.gyan.dev/ffmpeg/builds/
|
||||
#
|
||||
|
||||
#os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
|
||||
# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
|
||||
def convert_to_wav(video_file_path, offset=0):
|
||||
print("Starting conversion process of .m4a to .WAV")
|
||||
out_path = os.path.splitext(video_file_path)[0] + ".wav"
|
||||
@@ -539,7 +539,8 @@ def convert_to_wav(video_file_path, offset=0):
|
||||
logging.debug("ffmpeg being ran on windows")
|
||||
|
||||
if sys.platform.startswith('win'):
|
||||
ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
|
||||
ffmpeg_cmd = "..\\Bin\\ffmpeg.exe"
|
||||
logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
|
||||
else:
|
||||
ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
|
||||
|
||||
@@ -749,7 +750,7 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='sm
|
||||
|
||||
|
||||
####################################################################################################################################
|
||||
#Summarizers
|
||||
# Summarizers
|
||||
#
|
||||
#
|
||||
|
||||
@@ -1023,7 +1024,7 @@ def summarize_with_llama(api_url, file_path, token, custom_prompt):
|
||||
logging.debug("API Response Data: %s", response_data)
|
||||
|
||||
if response.status_code == 200:
|
||||
#if 'X' in response_data:
|
||||
# if 'X' in response_data:
|
||||
logging.debug(response_data)
|
||||
summary = response_data['content'].strip()
|
||||
logging.debug("llama: Summarization successful")
|
||||
@@ -1236,28 +1237,11 @@ def process_text(api_key, text_file):
|
||||
return "Notice:", message
|
||||
|
||||
|
||||
def format_file_path(file_path):
    """Return ``file_path`` if it names an existing filesystem entry, else ``None``.

    Falsy values (``None``, empty string) are rejected without touching the
    filesystem.
    """
    if not file_path:
        return None
    if not os.path.exists(file_path):
        return None
    return file_path
|
||||
|
||||
def launch_ui(demo_mode=False):
|
||||
def process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter,
|
||||
download_video):
|
||||
try:
|
||||
# Assuming 'main' is the function that handles the processing logic.
|
||||
# Adjust parameters as needed based on your actual 'main' function implementation.
|
||||
results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers,
|
||||
whisper_model=whisper_model, offset=offset, vad_filter=vad_filter,
|
||||
download_video_flag=download_video, custom_prompt=custom_prompt)
|
||||
|
||||
if results:
|
||||
transcription_result = results[0]
|
||||
json_data = transcription_result['transcription']
|
||||
summary_file_path = transcription_result.get('summary', "Summary not available.")
|
||||
json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json')
|
||||
video_file_path = transcription_result.get('video_path', None)
|
||||
return json_data, summary_file_path, json_file_path, summary_file_path, video_file_path
|
||||
else:
|
||||
return "No results found.", "No summary available.", None, None, None
|
||||
except Exception as e:
|
||||
return str(e), "Error processing the request.", None, None, None
|
||||
|
||||
inputs = [
|
||||
gr.components.Textbox(label="URL", placeholder="Enter the video URL here"),
|
||||
gr.components.Number(value=2, label="Number of Speakers"),
|
||||
@@ -1275,8 +1259,90 @@ def launch_ui(demo_mode=False):
|
||||
outputs = [
|
||||
gr.components.Textbox(label="Transcription"),
|
||||
gr.components.Textbox(label="Summary or Status Message"),
|
||||
gr.components.File(label="Download Transcription as JSON", visible=lambda x: x is not None),
|
||||
gr.components.File(label="Download Summary as Text", visible=lambda x: x is not None),
|
||||
gr.components.File(label="Download Transcription as JSON", visible=lambda x: x != "File not available"),
|
||||
gr.components.File(label="Download Summary as Text", visible=lambda x: x != "File not available"),
|
||||
gr.components.File(label="Download Video", visible=lambda x: x is not None)
|
||||
]
|
||||
|
||||
def process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter,
                download_video):
    """Gradio callback: run the processing pipeline for one URL and shape the UI outputs.

    Calls ``main`` with the form values and uses only the first entry of the
    returned results. The JSON and summary file paths are derived from that
    entry's ``'audio_file'`` path and passed through ``format_file_path``,
    which yields ``None`` for files that do not exist.

    Returns:
        A 5-tuple ``(transcription, status message, json file path,
        summary file path, video file path)``. Every branch returns five
        values because the interface built on this callback declares five
        output components.
    """
    # No video path is taken from the results here; the video slot stays empty.
    video_file_path = None
    try:
        results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers,
                       whisper_model=whisper_model, offset=offset, vad_filter=vad_filter,
                       download_video_flag=download_video, custom_prompt=custom_prompt)
        if results:
            transcription_result = results[0]
            json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json')
            summary_file_path = json_file_path.replace('.segments.json', '_summary.txt')

            json_file_path = format_file_path(json_file_path)
            summary_file_path = format_file_path(summary_file_path)

            return transcription_result['transcription'], "Summary available", json_file_path, summary_file_path, video_file_path
        else:
            # Fixed: previously returned only four values for five outputs.
            return "No results found.", "No summary available.", None, None, None
    except Exception as e:
        # UI boundary: surface the error text in the transcription box instead of crashing.
        # Fixed: previously returned only four values for five outputs.
        return str(e), "Error processing the request.", None, None, None
|
||||
|
||||
iface = gr.Interface(
|
||||
fn=process_url,
|
||||
inputs=inputs,
|
||||
outputs=outputs,
|
||||
title="Video Transcription and Summarization",
|
||||
description="Submit a video URL for transcription and summarization. Ensure you input all necessary information including API keys."
|
||||
)
|
||||
|
||||
iface.launch(share=False)
|
||||
|
||||
|
||||
|
||||
|
||||
a = """def launch_ui(demo_mode=False):
|
||||
def process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter,
|
||||
download_video):
|
||||
try:
|
||||
results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers,
|
||||
whisper_model=whisper_model, offset=offset, vad_filter=vad_filter,
|
||||
download_video_flag=download_video, custom_prompt=custom_prompt)
|
||||
|
||||
if results:
|
||||
transcription_result = results[0]
|
||||
json_data = transcription_result['transcription']
|
||||
json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json')
|
||||
summary_file_path = transcription_result.get('summary', "Summary not available.")
|
||||
video_file_path = transcription_result.get('video_path', None)
|
||||
|
||||
json_file_path = format_file_path(json_file_path)
|
||||
summary_file_path = format_file_path(summary_file_path)
|
||||
|
||||
return json_data, "Summary available", json_file_path, summary_file_path, video_file_path
|
||||
else:
|
||||
return "No results found.", "No summary available.", None, None, None
|
||||
except Exception as e:
|
||||
return str(e), "Error processing the request.", None, None, None, None
|
||||
|
||||
inputs = [
|
||||
gr.components.Textbox(label="URL", placeholder="Enter the video URL here"),
|
||||
gr.components.Number(value=2, label="Number of Speakers"),
|
||||
gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model"),
|
||||
gr.components.Textbox(label="Custom Prompt",
|
||||
placeholder="Q: As a professional summarizer, create a concise and comprehensive summary of the provided text.\nA: Here is a detailed, bulleted list of the key points made in the transcribed video and supporting arguments:",
|
||||
lines=3),
|
||||
gr.components.Number(value=0, label="Offset"),
|
||||
gr.components.Dropdown(
|
||||
choices=["huggingface", "openai", "anthropic", "cohere", "groq", "llama", "kobold", "ooba"],
|
||||
label="API Name"),
|
||||
gr.components.Textbox(label="API Key", placeholder="Enter your API key here"),
|
||||
gr.components.Checkbox(label="VAD Filter", value=False),
|
||||
gr.components.Checkbox(label="Download Video", value=False)
|
||||
]
|
||||
|
||||
outputs = [
|
||||
gr.components.Textbox(label="Transcription"),
|
||||
gr.components.Textbox(label="Summary or Status Message"),
|
||||
gr.components.File(label="Download Transcription as JSON", visible=lambda x: x != "File not available"),
|
||||
gr.components.File(label="Download Summary as Text", visible=lambda x: x != "File not available"),
|
||||
gr.components.File(label="Download Video", visible=lambda x: x is not None)
|
||||
]
|
||||
|
||||
@@ -1290,7 +1356,7 @@ def launch_ui(demo_mode=False):
|
||||
)
|
||||
|
||||
iface.launch(share=False)
|
||||
|
||||
"""
|
||||
|
||||
#
|
||||
#
|
||||
@@ -1332,7 +1398,12 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
||||
download_path = create_download_directory(info_dict['title'])
|
||||
logging.debug("MAIN: Path created successfully")
|
||||
logging.debug("MAIN: Downloading video from yt_dlp...")
|
||||
video_path = download_video(path, download_path, info_dict, download_video_flag)
|
||||
try:
|
||||
video_path = download_video(path, download_path, info_dict, download_video_flag)
|
||||
except RuntimeError as e:
|
||||
logging.error(f"Error downloading video: {str(e)}")
|
||||
#FIXME - figure something out for handling this situation....
|
||||
continue
|
||||
logging.debug("MAIN: Video downloaded successfully")
|
||||
logging.debug("MAIN: Converting video file to WAV...")
|
||||
audio_file = convert_to_wav(video_path, offset)
|
||||
@@ -1436,7 +1507,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
|
||||
logging.error(f"Error processing path: {path}")
|
||||
logging.error(str(e))
|
||||
end_time = time.monotonic()
|
||||
#print("Total program execution time: " + timedelta(seconds=end_time - start_time))
|
||||
# print("Total program execution time: " + timedelta(seconds=end_time - start_time))
|
||||
|
||||
return results
|
||||
|
||||
@@ -1455,7 +1526,9 @@ if __name__ == "__main__":
|
||||
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Log level (default: INFO)')
|
||||
parser.add_argument('-ui', '--user_interface', action='store_true', help='Launch the Gradio user interface')
|
||||
parser.add_argument('-demo', '--demo_mode', action='store_true', help='Enable demo mode')
|
||||
#parser.add_argument('--log_file', action=str, help='Where to save logfile (non-default)')
|
||||
parser.add_argument('-prompt', '--custom_prompt', type=str,
|
||||
help='Pass in a custom prompt to be used in place of the existing one.(Probably should just modify the script itself...)')
|
||||
# parser.add_argument('--log_file', action=str, help='Where to save logfile (non-default)')
|
||||
args = parser.parse_args()
|
||||
|
||||
custom_prompt = args.custom_prompt
|
||||
@@ -1467,9 +1540,9 @@ if __name__ == "__main__":
|
||||
args.custom_prompt = "\n\nQ: As a professional summarizer, create a concise and comprehensive summary of the provided text.\nA: Here is a detailed, bulleted list of the key points made in the transcribed video and supporting arguments:"
|
||||
print("No custom prompt defined, will use default")
|
||||
|
||||
print(f"Is CUDA available: {torch.cuda.is_available()}")
|
||||
# print(f"Is CUDA available: {torch.cuda.is_available()}")
|
||||
# True
|
||||
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
|
||||
# print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
|
||||
# Tesla T4
|
||||
|
||||
# Since this is running in HF....
|
||||
@@ -1491,7 +1564,7 @@ if __name__ == "__main__":
|
||||
logging.info(f'Whisper model: {args.whisper_model}')
|
||||
logging.info(f'Offset: {args.offset}')
|
||||
logging.info(f'VAD filter: {args.vad_filter}')
|
||||
logging.info(f'Log Level: {args.log_level}') #lol
|
||||
logging.info(f'Log Level: {args.log_level}') # lol
|
||||
|
||||
if args.api_name and args.api_key:
|
||||
logging.info(f'API: {args.api_name}')
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import os
|
||||
from typing import List, Tuple, Optional
|
||||
from openai import OpenAI
|
||||
import tiktoken
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# script from: https://github.com/openai/openai-cookbook/blob/main/examples/Summarizing_long_documents.ipynb
|
||||
|
||||
|
||||
# Open dataset
|
||||
with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as file:
|
||||
@@ -14,15 +15,15 @@ with open(".\\tldw-original-scripts\\Samples\\ai_wikipedia.txt", "r") as file:
|
||||
encoding = tiktoken.encoding_for_model('gpt-4-turbo')
|
||||
print(len(encoding.encode(artificial_intelligence)))
|
||||
|
||||
|
||||
# Call wrapper to OpenAI
|
||||
client = OpenAI(api_key="")
|
||||
|
||||
|
||||
def get_chat_completion(messages, model='gpt-4-turbo'):
    """Send ``messages`` to the chat completions endpoint and return the reply text.

    Parameters:
    - messages: the message list passed through unchanged to the API call.
    - model (str, optional): model name for the request. Defaults to 'gpt-4-turbo'.

    Returns:
    - The ``content`` of the first choice's message in the response.
    """
    # Each keyword is passed exactly once; temperature=0 is fixed for every call.
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message.content
|
||||
|
||||
@@ -32,6 +33,7 @@ def tokenize(text: str) -> List[str]:
|
||||
encoding = tiktoken.encoding_for_model('gpt-4-turbo')
|
||||
return encoding.encode(text)
|
||||
|
||||
|
||||
# This function chunks a text into smaller pieces based on a maximum token count and a delimiter
|
||||
def chunk_on_delimiter(input_string: str,
|
||||
max_tokens: int,
|
||||
@@ -45,13 +47,181 @@ def chunk_on_delimiter(input_string: str,
|
||||
combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
|
||||
return combined_chunks
|
||||
|
||||
|
||||
# This function combines text chunks into larger blocks without exceeding a specified token count.
|
||||
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
|
||||
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """
    Greedily merge consecutive chunks into blocks of at most ``max_tokens`` tokens.

    Token counts are measured with ``tokenize`` on the delimiter-joined text.

    Parameters:
    - chunks (List[str]): the pieces of text to combine, in order.
    - max_tokens (int): upper bound on the token count of each combined block.
    - chunk_delimiter (str, optional): string used to join chunks. Defaults to "\\n\\n".
    - header (Optional[str], optional): if given, placed at the start of every block
      and included in the token count.
    - add_ellipsis_for_overflow (bool, optional): if True, "..." is appended to the
      current block in place of a dropped chunk, provided it still fits.

    Returns:
    - A 3-tuple ``(output, output_indices, dropped_chunk_count)``: the combined
      blocks, the list of original chunk indices for each block, and the number
      of chunks dropped because they exceed ``max_tokens`` on their own.
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined chunks
    output_indices = []  # list to hold the indices of the final combined chunks
    candidate = (
        [] if header is None else [header]
    )  # list to hold the current combined chunk candidate
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # A chunk that cannot fit even alone (plus header) is dropped entirely.
        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print("warning: chunk overflow")
            if (
                    add_ellipsis_for_overflow
                    and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
                dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # estimate token count with the current chunk added
        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))
        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # otherwise keep extending the candidate
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # add the remaining candidate to output if it's not empty
    # (a candidate holding only the header counts as empty)
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
|
||||
|
||||
|
||||
def summarize(text: str,
              detail: float = 0,
              model: str = 'gpt-4-turbo',
              additional_instructions: Optional[str] = None,
              minimum_chunk_size: Optional[int] = 500,
              chunk_delimiter: str = ".",
              summarize_recursively=False,
              verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the
      summary. 0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The model to use for generating summaries. Defaults to 'gpt-4-turbo'.
    - additional_instructions (Optional[str], optional): Additional instructions appended to the system
      message for customizing summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
    - summarize_recursively (bool, optional): If True, each request after the first also includes the
      summaries produced so far as context.
    - verbose (bool, optional): If True, prints the number of chunks and their token lengths.

    Returns:
    - str: The per-chunk summaries joined with blank lines.

    The function first determines the number of chunks by interpolating between a minimum and a maximum
    chunk count based on the `detail` parameter. It then splits the text into chunks and summarizes each
    chunk with `get_chat_completion`.
    """

    # check detail is set correctly
    assert 0 <= detail <= 1, "detail must be between 0 and 1"

    # interpolate the number of chunks based to get specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    # Clamp to at least one chunk so the floor division below can never divide by zero.
    num_chunks = max(min_chunks, int(min_chunks + detail * (max_chunks - min_chunks)))

    # adjust chunk_size based on interpolated number of chunks
    document_length = len(tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")

    # set system message
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Creating a structured prompt for recursive summarization
            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
        else:
            # Directly passing the chunk for summarization without recursive context
            user_message_content = chunk

        # Constructing messages based on whether recursive summarization is applied
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        # One API request per chunk; the reply text is stored as that chunk's summary.
        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    # Compile final summary from partial summaries
    final_summary = '\n\n'.join(accumulated_summaries)

    return final_summary
|
||||
|
||||
# Demo driver: summarize the loaded dataset at several detail levels and print the results.
# NOTE(review): the dataset variable is called `artificial_intelligence` everywhere else in the
# visible script, so the last two calls below were changed to use it too - confirm against the
# file-loading code at the top of the script.

# Summary at 0 detail
summary_with_detail_0 = summarize(artificial_intelligence, detail=0, verbose=True)

# Summary at 0.25 detail
summary_with_detail_pt25 = summarize(artificial_intelligence, detail=0.25, verbose=True)

# Summary at 0.5 detail
summary_with_detail_pt5 = summarize(artificial_intelligence, detail=0.5, verbose=True)

# Summary at 0.75 detail
summary_with_detail_pt75 = summarize(artificial_intelligence, detail=0.75, verbose=True)

# Summary at 1 detail
summary_with_detail_1 = summarize(artificial_intelligence, detail=1, verbose=True)

# Lengths of summaries (printed; a bare expression is silently discarded outside a notebook):
print([len(tokenize(x)) for x in
       [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5, summary_with_detail_pt75,
        summary_with_detail_1]])

# print 0 detail summary
print(summary_with_detail_0)

# print 0.25 detail summary
print(summary_with_detail_pt25)

# print 0.5 detail summary
print(summary_with_detail_pt5)

# print 0.75 detail summary
print(summary_with_detail_pt75)

# print 1.0 detail summary
print(summary_with_detail_1)

# Print summary using additional instructions:
summary_with_additional_instructions = summarize(artificial_intelligence, detail=0.1,
                                                 additional_instructions="Write in point form and focus on numerical data.")
print(summary_with_additional_instructions)

# Print summary using recursive summarization:
recursive_summary = summarize(artificial_intelligence, detail=0.1, summarize_recursively=True)
print(recursive_summary)
|
||||
|
||||
|
||||
@@ -500,8 +500,8 @@ def download_video(video_url, download_path, info_dict, download_video_flag):
|
||||
]
|
||||
subprocess.run(ffmpeg_command, check=True)
|
||||
else:
|
||||
logging.error("You shouldn't be here...")
|
||||
exit()
|
||||
logging.error("ffmpeg: Unsupported operating system for video download and merging.")
|
||||
raise RuntimeError("ffmpeg: Unsupported operating system for video download and merging.")
|
||||
os.remove(video_file_path)
|
||||
os.remove(audio_file_path)
|
||||
|
||||
@@ -533,6 +533,7 @@ def convert_to_wav(video_file_path, offset=0):
|
||||
|
||||
if sys.platform.startswith('win'):
|
||||
ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
|
||||
logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
|
||||
else:
|
||||
ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
|
||||
|
||||
|
||||
Reference in New Issue
Block a user