From 0dbd739eca6d29700f7036157003ab23f0b234f0 Mon Sep 17 00:00:00 2001
From: Robert <contact@rmusser.net>
Date: Fri, 10 May 2024 16:11:18 -0700
Subject: [PATCH] First stab at custom summarization prompts

---
 HF/app.py    | 64 ++++++++++++++++++++-------------------
 summarize.py | 85 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 83 insertions(+), 66 deletions(-)
diff --git a/HF/app.py b/HF/app.py
index 0c7541a..b9076b4 100644
--- a/HF/app.py
+++ b/HF/app.py
@@ -45,23 +45,22 @@ import yt_dlp
 #
 #
 # Usage:
-#          Transcribe a single URL: 
-#                python diarize.py https://example.com/video.mp4
 #
-#          Transcribe a single URL and have the resulting transcription summarized: 
-#                python diarize.py https://example.com/video.mp4
+#   Download Audio only from URL -> Transcribe audio:
+#       python summarize.py https://www.youtube.com/watch?v=4nd1CDZP21s`
 #
-#          Transcribe a list of files:
-#               python diarize.py ./path/to/your/text_file.txt
+#   Download Audio+Video from URL -> Transcribe audio from Video:**
+#       python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s`
 #
-#          Transcribe a local file:
-#               python diarize.py /path/to/your/localfile.mp4
-#
-#          Transcribe a local file and have it summarized:
-#               python diarize.py ./input.mp4 --api_name openai --api_key <your_openai_api_key>
-#
-#          Transcribe a list of files and have them all summarized:
-#               python diarize.py path_to_your_text_file.txt --api_name <openai> --api_key <your_openai_api_key>
+#   Download Audio only from URL -> Transcribe audio -> Summarize using (`anthropic`/`cohere`/`openai`/`llama` (llama.cpp)/`ooba` (oobabooga/text-gen-webui)/`kobold` (kobold.cpp)/`tabby` (Tabbyapi)) API:**
+#       python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s -api <your choice of API>` - Make sure to put your API key into `config.txt` under the appropriate API variable
+# 
+#   Download Audio+Video from a list of videos in a text file (can be file paths or URLs) and have them all summarized:**
+#       python summarize.py ./local/file_on_your/system --api_name <API_name>`
+# 
+#   Run it as a WebApp**
+#       python summarize.py -gui` - This requires you to either stuff your API keys into the `config.txt` file, or pass them into the app every time you want to use it.
+#           Can be helpful for setting up a shared instance, but not wanting people to perform inference on your server.
 #
 ###
 
@@ -350,7 +349,7 @@ def process_local_file(file_path):
 # Video Download/Handling
 #
 
-def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0, api_name=None, api_key=None, vad_filter=False, download_video_flag=False, demo_mode=True):
+def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0, api_name=None, api_key=None, vad_filter=False, download_video_flag=False,custom_prompt=None, demo_mode=True):
     if demo_mode:
         api_name = "huggingface"
         api_key = os.environ.get(HF_TOKEN)
@@ -793,7 +792,7 @@ def summarize_with_openai(api_key, file_path, model):
         }
         
         logging.debug("openai: Preparing data + prompt for submittal")
-        prompt_text = f"{text} \n\n\n\nPlease provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points"
+        openai_prompt = f"{text} \n\n\n\n{prompt_text}"
         data = {
             "model": model,
             "messages": [
@@ -803,7 +802,7 @@ def summarize_with_openai(api_key, file_path, model):
                 },
                 {
                     "role": "user",
-                    "content": prompt_text
+                    "content": openai_prompt
                 }
             ],
             "max_tokens": 4096,  # Adjust tokens as needed
@@ -846,7 +845,7 @@ def summarize_with_claude(api_key, file_path, model):
         logging.debug("anthropic: Prepping data + prompt for submittal")
         user_message = {
             "role": "user",
-            "content": f"{text} \n\n\n\nPlease provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points"
+            "content": f"{text} \n\n\n\n{prompt_text}"
         }
 
         data = {
@@ -913,10 +912,10 @@ def summarize_with_cohere(api_key, file_path, model):
             'Authorization': f'Bearer {api_key}'
         }
 
-        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
+        cohere_prompt = f"{text} \n\n\n\n{prompt_text}"
         data = {
             "chat_history": [
-                {"role": "USER", "message": prompt_text}
+                {"role": "USER", "message": cohere_prompt}
             ],
             "message": "Please provide a summary.",
             "model": model,
@@ -964,12 +963,12 @@ def summarize_with_groq(api_key, file_path, model):
             'Content-Type': 'application/json'
         }
 
-        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
+        groq_prompt = f"{text} \n\n\n\n{prompt_text}"
         data = {
             "messages": [
                 {
                     "role": "user",
-                    "content": prompt_text
+                    "content": groq_prompt
                 }
             ],
             "model": model
@@ -1021,12 +1020,13 @@ def summarize_with_llama(api_url, file_path, token):
             headers['Authorization'] = f'Bearer {token}'
 
 
-        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
+        llama_prompt = f"{text} \n\n\n\n{prompt_text}"
+        logging.debug(f"llama: Complete prompt is: {llama_prompt}")
         data = {
-            "prompt": prompt_text
+            "prompt": llama_prompt
         }
 
-        logging.debug("llama: Submitting request to API endpoint")
+        #logging.debug(f"llama: Submitting request to API endpoint {llama_prompt}")
         print("llama: Submitting request to API endpoint")
         response = requests.post(api_url, headers=headers, json=data)
         response_data = response.json()
@@ -1064,13 +1064,13 @@ def summarize_with_kobold(api_url, file_path):
             'content-type': 'application/json',
         }
         # FIXME
-        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the above text."
-        logging.debug(prompt_text)
+        kobold_prompt = f"{text} \n\n\n\n{prompt_text}"
+        logging.debug(kobold_prompt)
         # Values literally c/p from the api docs....
         data = {
             "max_context_length": 8096,
             "max_length": 4096,
-            "prompt": prompt_text,
+            "prompt": kobold_prompt,
         }
 
         logging.debug("kobold: Submitting request to API endpoint")
@@ -1114,9 +1114,9 @@ def summarize_with_oobabooga(api_url, file_path):
             'content-type': 'application/json',
         }
 
-        prompt_text = "I like to eat cake and bake cakes. I am a baker. I work in a french bakery baking cakes. It is a fun job. I have been baking cakes for ten years. I also bake lots of other baked goods, but cakes are my favorite."
-        # prompt_text += f"\n\n{text}"  # Uncomment this line if you want to include the text variable
-        prompt_text += "\n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
+        #prompt_text = "I like to eat cake and bake cakes. I am a baker. I work in a french bakery baking cakes. It is a fun job. I have been baking cakes for ten years. I also bake lots of other baked goods, but cakes are my favorite."
+        #prompt_text += f"\n\n{text}"  # Uncomment this line if you want to include the text variable
+        ooba_prompt = f"{text}\n\n\n\n{prompt_text}"
 
         data =  {
             "mode": "chat",
@@ -1268,6 +1268,7 @@ def launch_ui(demo_mode=False):
         gr.components.Textbox(label="URL of video to be Transcribed/Summarized"),
         gr.components.Number(value=2, label="Number of Speakers (for Diarization)"),
         gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model (Can ignore this)"),
+        gr.components.Textbox(label="Custom Prompt", value="Please provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points", lines=3),
         gr.components.Number(value=0, label="Offset time to start transcribing from\n\n (helpful if you only want part of a video/lecture)")
     ]
 
@@ -1316,6 +1317,7 @@ def launch_ui(demo_mode=False):
 ####################################################################################################################################
 # Main()
 #
+
 def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False, demo_mode=False):
     if input_path is None and args.user_interface:
         return []
diff --git a/summarize.py b/summarize.py
index fd5305d..957656f 100644
--- a/summarize.py
+++ b/summarize.py
@@ -36,23 +36,22 @@ import yt_dlp
 #
 #
 # Usage:
-#          Transcribe a single URL: 
-#                python diarize.py https://example.com/video.mp4
 #
-#          Transcribe a single URL and have the resulting transcription summarized: 
-#                python diarize.py https://example.com/video.mp4
+#   Download Audio only from URL -> Transcribe audio:
+#       python summarize.py https://www.youtube.com/watch?v=4nd1CDZP21s`
 #
-#          Transcribe a list of files:
-#               python diarize.py ./path/to/your/text_file.txt
+#   Download Audio+Video from URL -> Transcribe audio from Video:**
+#       python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s`
 #
-#          Transcribe a local file:
-#               python diarize.py /path/to/your/localfile.mp4
-#
-#          Transcribe a local file and have it summarized:
-#               python diarize.py ./input.mp4 --api_name openai --api_key <your_openai_api_key>
-#
-#          Transcribe a list of files and have them all summarized:
-#               python diarize.py path_to_your_text_file.txt --api_name <openai> --api_key <your_openai_api_key>
+#   Download Audio only from URL -> Transcribe audio -> Summarize using (`anthropic`/`cohere`/`openai`/`llama` (llama.cpp)/`ooba` (oobabooga/text-gen-webui)/`kobold` (kobold.cpp)/`tabby` (Tabbyapi)) API:**
+#       python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s -api <your choice of API>` - Make sure to put your API key into `config.txt` under the appropriate API variable
+# 
+#   Download Audio+Video from a list of videos in a text file (can be file paths or URLs) and have them all summarized:**
+#       python summarize.py ./local/file_on_your/system --api_name <API_name>`
+# 
+#   Run it as a WebApp**
+#       python summarize.py -gui` - This requires you to either stuff your API keys into the `config.txt` file, or pass them into the app every time you want to use it.
+#           Can be helpful for setting up a shared instance, but not wanting people to perform inference on your server.
 #
 ###
 
@@ -341,7 +340,7 @@ def process_local_file(file_path):
 # Video Download/Handling
 #
 
-def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0, api_name=None, api_key=None, vad_filter=False, download_video_flag=False, demo_mode=False):
+def process_url(input_path, num_speakers=2, whisper_model="small.en", custom_prompt=None, offset=0, api_name=None, api_key=None, vad_filter=False, download_video_flag=False, demo_mode=False):
     if demo_mode:
 #        api_name = "<demo_mode_api>"
 #        api_key = "<demo_mode_key>"
@@ -782,7 +781,7 @@ def summarize_with_openai(api_key, file_path, model):
         }
         
         logging.debug("openai: Preparing data + prompt for submittal")
-        prompt_text = f"{text} \n\n\n\nPlease provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points"
+        openai_prompt = f"{text} \n\n\n\n{custom_prompt}"
         data = {
             "model": model,
             "messages": [
@@ -792,7 +791,7 @@ def summarize_with_openai(api_key, file_path, model):
                 },
                 {
                     "role": "user",
-                    "content": prompt_text
+                    "content": openai_prompt
                 }
             ],
             "max_tokens": 4096,  # Adjust tokens as needed
@@ -831,11 +830,12 @@ def summarize_with_claude(api_key, file_path, model):
             'anthropic-version': '2023-06-01',
             'Content-Type': 'application/json'
         }
-        
-        logging.debug("anthropic: Prepping data + prompt for submittal")
+
+        anthropic_prompt = custom_prompt
+        logging.debug("anthropic: Prompt is {anthropic_prompt}")
         user_message = {
             "role": "user",
-            "content": f"{text} \n\n\n\nPlease provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points"
+            "content": f"{text} \n\n\n\n{anthropic_prompt}"
         }
 
         data = {
@@ -902,10 +902,12 @@ def summarize_with_cohere(api_key, file_path, model):
             'Authorization': f'Bearer {api_key}'
         }
 
-        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
+        cohere_prompt = f"{text} \n\n\n\n{custom_prompt}"
+        logging.debug("cohere: Prompt being sent is {cohere_prompt}")
+
         data = {
             "chat_history": [
-                {"role": "USER", "message": prompt_text}
+                {"role": "USER", "message": cohere_prompt}
             ],
             "message": "Please provide a summary.",
             "model": model,
@@ -953,12 +955,14 @@ def summarize_with_groq(api_key, file_path, model):
             'Content-Type': 'application/json'
         }
 
-        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
+        groq_prompt = f"{text} \n\n\n\n{custom_prompt}"
+        logging.debug("groq: Prompt being sent is {groq_prompt}")
+
         data = {
             "messages": [
                 {
                     "role": "user",
-                    "content": prompt_text
+                    "content": groq_prompt
                 }
             ],
             "model": model
@@ -1010,9 +1014,11 @@ def summarize_with_llama(api_url, file_path, token):
             headers['Authorization'] = f'Bearer {token}'
 
 
-        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
+        llama_prompt = f"{text} \n\n\n\n{custom_prompt}"
+        logging.debug("llama: Prompt being sent is {llama_prompt}")
+
         data = {
-            "prompt": prompt_text
+            "prompt": llama_prompt
         }
 
         logging.debug("llama: Submitting request to API endpoint")
@@ -1052,14 +1058,16 @@ def summarize_with_kobold(api_url, file_path):
             'accept': 'application/json',
             'content-type': 'application/json',
         }
+
+        kobold_prompt = f"{text} \n\n\n\n{custom_prompt}"
+        logging.debug("kobold: Prompt being sent is {kobold_prompt}")
+
         # FIXME
-        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the above text."
-        logging.debug(prompt_text)
         # Values literally c/p from the api docs....
         data = {
             "max_context_length": 8096,
             "max_length": 4096,
-            "prompt": prompt_text,
+            "prompt": kobold_prompt,
         }
 
         logging.debug("kobold: Submitting request to API endpoint")
@@ -1103,14 +1111,15 @@ def summarize_with_oobabooga(api_url, file_path):
             'content-type': 'application/json',
         }
 
-        prompt_text = "I like to eat cake and bake cakes. I am a baker. I work in a french bakery baking cakes. It is a fun job. I have been baking cakes for ten years. I also bake lots of other baked goods, but cakes are my favorite."
+        # prompt_text = "I like to eat cake and bake cakes. I am a baker. I work in a french bakery baking cakes. It is a fun job. I have been baking cakes for ten years. I also bake lots of other baked goods, but cakes are my favorite."
         # prompt_text += f"\n\n{text}"  # Uncomment this line if you want to include the text variable
-        prompt_text += "\n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
+        ooba_prompt = "{text}\n\n\n\n{custom_prompt}"
+        logging.debug("ooba: Prompt being sent is {ooba_prompt}")
 
         data =  {
             "mode": "chat",
             "character": "Example",
-            "messages": [{"role": "user", "content": prompt_text}]
+            "messages": [{"role": "user", "content": ooba_prompt}]
         }
 
         logging.debug("ooba: Submitting request to API endpoint")
@@ -1166,7 +1175,6 @@ def summarize_with_huggingface(api_key, file_path):
         logging.debug(f"huggingface: Segments: {segments}")
         text = ' '.join([segment['text'] for segment in segments])
 
-
 # API KEY ASSIGNMENT HERE
         api_key = huggingface_api_key
         print(f"huggingface: lets make sure the HF api key exists...\n\t {huggingface_api_key}" )
@@ -1176,6 +1184,10 @@ def summarize_with_huggingface(api_key, file_path):
 
         model = "microsoft/Phi-3-mini-128k-instruct"
         API_URL = f"https://api-inference.huggingface.co/models/{model}"
+
+
+        huggingface_prompt = "{text}\n\n\n\n{custom_prompt}"
+        logging.debug("huggingface: Prompt being sent is {huggingface_prompt}")
         data = {
             "inputs": text,
             "parameters": {"max_length": 512, "min_length": 100}  # You can adjust max_length and min_length as needed
@@ -1184,6 +1196,7 @@ def summarize_with_huggingface(api_key, file_path):
         print(f"huggingface: lets make sure the HF api key is the same..\n\t {huggingface_api_key}")
 
         logging.debug("huggingface: Submitting request...")
+
         response = requests.post(API_URL, headers=headers, json=data)
         
         if response.status_code == 200:
@@ -1217,6 +1230,7 @@ def launch_ui(demo_mode=False):
         gr.components.Textbox(label="URL"),
         gr.components.Number(value=2, label="Number of Speakers"),
         gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model"),
+        gr.components.Textbox(label="Custom Prompt", value="Please provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points", lines=3),
         gr.components.Number(value=0, label="Offset")
     ]
 
@@ -1245,7 +1259,8 @@ def launch_ui(demo_mode=False):
         theme="bethecloud/storj_theme"
     )
 
-    iface.launch(share=True)
+    #iface.launch(share=True)
+    iface.launch(share=False)
 
 #
 #
@@ -1260,7 +1275,7 @@ def launch_ui(demo_mode=False):
 ####################################################################################################################################
 # Main()
 #
-def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False):
+def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False, custom_prompt=None):
     if input_path is None and args.user_interface:
         return []
     start_time = time.monotonic()