Mirror of https://github.com/jlengrand/tldw.git (synced 2026-03-10 08:51:17 +00:00)
Commit: Trudging
README.md (14 lines changed)
@@ -36,10 +36,12 @@ GUI

- **Download Audio+Video from URL -> Transcribe audio from Video:**
    * `python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s`
- **Download Audio only from URL -> Transcribe audio -> Summarize using (`anthropic`/`cohere`/`openai`/`llama` (llama.cpp)/`ooba` (oobabooga/text-gen-webui)/`kobold` (kobold.cpp)/`tabby` (Tabbyapi)) API:**
    * `python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s -api <your choice of API>`
    * `python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s -api <your choice of API>` - Make sure to put your API key into `config.txt` under the appropriate API variable (see the `config.txt` sketch below).
- **Download Audio+Video from a list of videos in a text file (can be file paths or URLs) and have them all summarized:**
    * `python summarize.py ./local/file_on_your/system --api_name <API_name>`
- **Run it as a WebApp**
    * `python summarize.py -gui` - This requires you to either stuff your API keys into the `config.txt` file, or pass them into the app every time you want to use it.
    * Can be helpful for setting up a shared instance where you don't want people performing inference on your server.
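The API keys referenced above live in `config.txt`. A minimal sketch of the relevant section, using the key names that the `config.get('API', ...)` calls in the code below expect (the values are placeholders, not real keys):

```
[API]
anthropic_api_key = <your_anthropic_api_key>
cohere_api_key = <your_cohere_api_key>
groq_api_key = <your_groq_api_key>
openai_api_key = <openai_api_key>
huggingface_api_key = <huggingface_api_key>
```

Model choices (`openai_model`, `huggingface_model`, and so on) go in the same section.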
### <a name="what"></a>What?
@@ -112,9 +114,9 @@ GUI

Save time and use the `config.txt` file; it lets you set these options once and have them applied whenever the script is run.
```
usage: summarize.py [-h] [--api_name API_NAME] [--api_key API_KEY] [--num_speakers NUM_SPEAKERS] [--whisper_model WHISPER_MODEL] [--offset OFFSET]
                    [--vad_filter] [--log_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
                    [input_path]
usage: summarize.py [-h] [-v] [-api API_NAME] [-ns NUM_SPEAKERS] [-wm WHISPER_MODEL] [-off OFFSET] [-vad]
                    [-log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-ui] [-demo]
                    [input_path]

Transcribe and summarize videos.

@@ -126,8 +128,6 @@ options:
  -v, --video           Download the video instead of just the audio
  -api API_NAME, --api_name API_NAME
                        API name for summarization (optional)
  -key API_KEY, --api_key API_KEY
                        API key for summarization (optional)
  -ns NUM_SPEAKERS, --num_speakers NUM_SPEAKERS
                        Number of speakers (default: 2)
  -wm WHISPER_MODEL, --whisper_model WHISPER_MODEL
Tests/HF/app.py (189 lines changed)
@@ -24,13 +24,13 @@ import yt_dlp

# To Do
# Offline diarization - https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/community/offline_usage_speaker_diarization.ipynb
# Dark mode changes under gradio
#
# Changes made to app.py version:
# 1. Removal of video files after conversion -> check main function
# 2. Usage of/Hardcoding HF_TOKEN as token for API calls
# 3. Usage of HuggingFace for Inference
# 4. Other stuff I can't remember. Will eventually do a diff and document them.
# 5. Dark mode changes under gradio
#
@@ -76,17 +76,27 @@ config.read('config.txt')

# API Keys
anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
logging.debug(f"Loaded Anthropic API Key: {anthropic_api_key}")

cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
logging.debug(f"Loaded cohere API Key: {cohere_api_key}")

groq_api_key = config.get('API', 'groq_api_key', fallback=None)
logging.debug(f"Loaded groq API Key: {groq_api_key}")

openai_api_key = config.get('API', 'openai_api_key', fallback=None)
logging.debug(f"Loaded openAI Face API Key: {openai_api_key}")

huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
logging.debug(f"Loaded HuggingFace Face API Key: {huggingface_api_key}")

# Models
anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
groq_model = config.get('API', 'groq_model', fallback='FIXME')
openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
huggingface_model = config.get('API', 'huggingface_model', fallback='microsoft/Phi-3-mini-128k-instruct')
huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus')

# Local-Models
kobold_api_IP = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
@@ -340,10 +350,11 @@ def process_local_file(file_path):

# Video Download/Handling
#

def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0, api_name=None, api_key=None, vad_filter=False, download_video_flag=False, demo_mode=False):
def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0, api_name=None, api_key=None, vad_filter=False, download_video_flag=False, demo_mode=True):
    if demo_mode:
        api_name = "huggingface"
        api_key = os.environ.get("HF_TOKEN")
        api_key = os.environ.get(HF_TOKEN)
        print("HUGGINGFACE API KEY CHECK #3: " + api_key)
        vad_filter = False
        download_video_flag = False
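The two `os.environ.get` lines above differ only in quoting: `os.environ.get("HF_TOKEN")` looks up the environment variable by its name, while `os.environ.get(HF_TOKEN)` requires a Python variable called `HF_TOKEN` to already exist and raises `NameError` otherwise. A minimal sketch of a safer lookup (the fail-loudly fallback here is an assumption, not what the repo does):

```
import os

def get_hf_token() -> str:
    # Look the token up by its environment-variable *name* (a string).
    token = os.environ.get("HF_TOKEN")
    if not token:
        # Hypothetical fallback: fail loudly instead of passing None downstream.
        raise RuntimeError("HF_TOKEN is not set; export it or put a key in config.txt")
    return token
```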
@@ -353,16 +364,20 @@ def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0,
        if results:
            transcription_result = results[0]
            json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json')

            with open(json_file_path, 'r') as file:
                json_data = json.load(file)

            summary_file_path = json_file_path.replace('.segments.json', '_summary.txt')

            if os.path.exists(summary_file_path):
                return json_data, summary_file_path, json_file_path, summary_file_path
            else:
                return json_data, "Summary not available.", json_file_path, None
        else:
            return None, "No results found.", None, None
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return None, error_message, None, None
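The transcription and summary paths in this block are derived purely by string replacement from the audio file name. A quick illustration of that naming convention (the file names are hypothetical):

```
audio_file = "lecture.wav"                                                      # hypothetical input
json_file_path = audio_file.replace('.wav', '.segments.json')                  # "lecture.segments.json"
summary_file_path = json_file_path.replace('.segments.json', '_summary.txt')   # "lecture_summary.txt"
```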
@@ -755,10 +770,10 @@ def speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embed
#
#

# Summarize with OpenAI ChatGPT
def extract_text_from_segments(segments):
    logging.debug(f"openai: extracting text from {segments}")
    logging.debug(f"Main: extracting text from {segments}")
    text = ' '.join([segment['text'] for segment in segments])
    logging.debug(f"Main: Successfully extracted text from {segments}")
    return text
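`extract_text_from_segments` simply joins the `text` field of each segment dict with spaces. A small usage sketch with made-up segments (the field names mirror the Whisper-style entries the code reads from `.segments.json`):

```
segments = [
    {"start": 0.0, "end": 3.5, "text": "Hello and welcome."},
    {"start": 3.5, "end": 7.0, "text": "Today we look at tldw."},
]
print(extract_text_from_segments(segments))
# -> "Hello and welcome. Today we look at tldw."
```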
@@ -1153,6 +1168,38 @@ def save_summary_to_file(summary, file_path):

# Only to be used when configured with Gradio for HF Space
def summarize_with_huggingface(api_key, file_path):
    logging.debug(f"huggingface: Summarization process starting...")

    model = "microsoft/Phi-3-mini-128k-instruct"
    API_URL = f"https://api-inference.huggingface.co/models/{model}"
    headers = {"Authorization": f"Bearer {api_key}"}

    with open(file_path, 'r') as file:
        segments = json.load(file)
    text = ''.join([segment['text'] for segment in segments])

    # FIXME adjust max_length and min_length as needed
    data = {
        "inputs": text,
        "parameters": {"max_length": 4096, "min_length": 100}
    }

    max_retries = 5

    for attempt in range(max_retries):
        response = requests.post(API_URL, headers=headers, json=data)
        if response.status_code == 200:
            summary = response.json()[0]['summary_text']
            return summary, None
        elif response.status_code == 503:
            response_data = response.json()
            wait_time = response_data.get('estimated_time', 10)
            return None, f"Model is loading, retrying in {int(wait_time)} seconds..."
            # Sleep before retrying....
            time.sleep(wait_time)

    if api_key == "":
        api_key = os.environ.get(HF_TOKEN)
    logging.debug("HUGGINGFACE API KEY CHECK: " + api_key)
    try:
        logging.debug("huggingface: Loading json data for summarization")
        with open(file_path, 'r') as file:
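As written, the 503 branch `return`s the "Model is loading" message before ever reaching `time.sleep(wait_time)`, so the loop only makes a single attempt. A minimal sketch of a retry loop that actually waits and retries against the HF Inference API (same URL/headers/payload as above; the logging-free structure and return shape are assumptions):

```
import time
import requests

def query_hf_with_retries(api_url, headers, data, max_retries=5):
    for attempt in range(max_retries):
        response = requests.post(api_url, headers=headers, json=data)
        if response.status_code == 200:
            return response.json()[0]['summary_text'], None
        if response.status_code == 503:
            # Model is still loading: wait the estimated time, then retry.
            wait_time = response.json().get('estimated_time', 10)
            time.sleep(wait_time)
            continue
        # Any other status: give up and surface the error body.
        return None, f"HTTP {response.status_code}: {response.text}"
    return None, "Model did not become available after retries."
```

Sleeping instead of returning lets attempts 2 through 5 actually happen; the caller only sees a message if every retry fails.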
@@ -1161,16 +1208,11 @@ def summarize_with_huggingface(api_key, file_path):
        logging.debug("huggingface: Extracting text from the segments")
        text = ' '.join([segment['text'] for segment in segments])

        api_key = os.environ.get('HF_TOKEN')
        headers = {
            "Authorization": f"Bearer {api_key}"
        }
        model = "microsoft/Phi-3-mini-128k-instruct"
        API_URL = f"https://api-inference.huggingface.co/models/{model}"
        data = {
            "inputs": text,
            "parameters": {"max_length": 512, "min_length": 100}  # You can adjust max_length and min_length as needed
        }
        api_key = os.environ.get(HF_TOKEN)
        logging.debug("HUGGINGFACE API KEY CHECK #2: " + api_key)

        logging.debug("huggingface: Submitting request...")
        response = requests.post(API_URL, headers=headers, json=data)
@@ -1195,56 +1237,65 @@ def summarize_with_huggingface(api_key, file_path):

def format_transcription(transcription_result):
    if transcription_result:
        json_data = transcription_result['transcription']
        return json.dumps(json_data, indent=2)
    else:
        return ""


def process_text(api_key,text_file):
    summary,message = summarize_with_huggingface(api_key,text_file)
    if summary:
        # Show summary on success
        return "Summary:",summary
    else:
        # Inform user about load/wait time
        return "Notice:",message


def launch_ui(demo_mode=False):
    def process_transcription(json_data):
        if json_data:
            return "\n".join([item["text"] for item in json_data])
            return json.dumps(json_data, indent=2)
            #return "\n".join([item["text"] for item in json_data])
        else:
            return ""
    with gr.Blocks(theme='bethecloud/storj_theme') as demo:
        with gr.Column(scale=3):
            with gr.Box():
                dropdown.render()
                toggle_dark = gr.Button(value="Toggle Dark").style(full_width=True)
        dropdown.change(None, dropdown, None, _js=js)
        toggle_dark.click(
            None,
            _js="""
            () => {
                document.body.classList.toggle('dark');
                document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
            }
            """,
        )

    inputs = [
        gr.components.Textbox(label="URL"),
        gr.components.Number(value=2, label="Number of Speakers"),
        gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model"),
        gr.components.Number(value=0, label="Offset")
        gr.components.Textbox(label="URL of video to be Transcribed/Summarized"),
        gr.components.Number(value=2, label="Number of Speakers (for Diarization)"),
        gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model (Can ignore this)"),
        gr.components.Number(value=0, label="Offset time to start transcribing from\n\n (helpful if you only want part of a video/lecture)")
    ]

    if not demo_mode:
        inputs.extend([
            gr.components.Dropdown(choices=["huggingface", "openai", "anthropic", "cohere", "groq", "llama", "kobold", "ooba"], value="anthropic", label="API Name"),
            gr.components.Textbox(label="API Key"),
            gr.components.Checkbox(value=False, label="VAD Filter"),
            gr.components.Checkbox(value=False, label="Download Video")
            gr.components.Dropdown(choices=["huggingface", "openai", "anthropic", "cohere", "groq", "llama", "kobold", "ooba"], value="huggingface", label="API Name - What LLM service will summarize your transcription"),
            gr.components.Textbox(label="API Key - Have to provide one, unless you're fine waiting on HuggingFace..."),
            # gr.components.Checkbox(value=False, label="Download Video"),
            # gr.components.Checkbox(value=False, label="VAD Filter")
        ])

    iface = gr.Interface(
        # fn=lambda url, num_speakers, whisper_model, offset, api_name, api_key: process_url(url, num_speakers, whisper_model, offset, api_name=api_name, api_key=api_key, demo_mode=demo_mode),
        fn=lambda *args: process_url(*args, demo_mode=demo_mode),
        inputs=inputs,
        outputs=[
            gr.components.Textbox(label="Transcription", value=lambda: "", max_lines=10),
            gr.components.Textbox(label="Summary"),
            gr.components.Textbox(label="Summary or Status Message"),
            gr.components.File(label="Download Transcription as JSON"),
            gr.components.File(label="Download Summary as text", visible=lambda summary_file_path: summary_file_path is not None)
        ],
        title="Video Transcription and Summarization",
        description="Submit a video URL for transcription and summarization.",
        allow_flagging="never",
        theme='bethecloud/storj_theme'
        #https://huggingface.co/spaces/bethecloud/storj_theme
        theme="bethecloud/storj_theme"
        # FIXME - Figure out how to enable dark mode...
        # other themes: https://huggingface.co/spaces/gradio/theme-gallery
    )

    iface.launch(share=True)
@@ -1262,7 +1313,7 @@ def launch_ui(demo_mode=False):

####################################################################################################################################
# Main()
#
def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False):
def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False, demo_mode=False):
    if input_path is None and args.user_interface:
        return []
    start_time = time.monotonic()
@@ -1325,61 +1376,72 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model=
        logging.debug(f"MAIN: Summarization being performed by {api_name}")
        json_file_path = audio_file.replace('.wav', '.segments.json')
        if api_name.lower() == 'openai':
            api_key = openai_api_key
            try:
                logging.debug(f"MAIN: trying to summarize with openAI")
                api_key = openai_api_key
                logging.debug(f"OpenAI: OpenAI API Key: {api_key}")
                summary = summarize_with_openai(api_key, json_file_path, openai_model)
            except requests.exceptions.ConnectionError:
                r.status_code = "Connection: "
        elif api_name.lower() == 'anthropic':
            api_key = anthropic_api_key
            try:
                logging.debug(f"MAIN: Trying to summarize with anthropic")
                logging.debug("MAIN: Trying to summarize with anthropic")
                api_key = anthropic_api_key
                logging.debug(f"Anthropic: Anthropic API Key: {api_key}")
                summary = summarize_with_claude(api_key, json_file_path, anthropic_model)
            except requests.exceptions.ConnectionError:
                r.status_code = "Connection: "
        elif api_name.lower() == 'cohere':
            api_key = cohere_api_key
            try:
                logging.debug(f"MAIN: Trying to summarize with cohere")
                logging.debug("Main: Trying to summarize with cohere")
                api_key = cohere_api_key
                logging.debug(f"Cohere: Cohere API Key: {api_key}")
                summary = summarize_with_cohere(api_key, json_file_path, cohere_model)
            except requests.exceptions.ConnectionError:
                r.status_code = "Connection: "
        elif api_name.lower() == 'groq':
            api_key = groq_api_key
            try:
                logging.debug(f"MAIN: Trying to summarize with Groq")
                logging.debug("Main: Trying to summarize with Groq")
                api_key = groq_api_key
                logging.debug(f"Groq: Groq API Key: {api_key}")
                summary = summarize_with_groq(api_key, json_file_path, groq_model)
            except requests.exceptions.ConnectionError:
                r.status_code = "Connection: "
        elif api_name.lower() == 'llama':
            token = llama_api_key
            llama_ip = llama_api_IP
            try:
                logging.debug(f"MAIN: Trying to summarize with Llama.cpp")
                logging.debug("Main: Trying to summarize with Llama.cpp")
                token = llama_api_key
                logging.debug(f"Llama.cpp: Llama.cpp API Key: {api_key}")
                llama_ip = llama_api_IP
                logging.debug(f"Llama.cpp: Llama.cpp API IP:Port : {llama_ip}")
                summary = summarize_with_llama(llama_ip, json_file_path, token)
            except requests.exceptions.ConnectionError:
                r.status_code = "Connection: "
        elif api_name.lower() == 'kobold':
            token = kobold_api_key
            kobold_ip = kobold_api_IP
            try:
                logging.debug(f"MAIN: Trying to summarize with kobold.cpp")
                logging.debug("Main: Trying to summarize with kobold.cpp")
                token = kobold_api_key
                logging.debug(f"kobold.cpp: Kobold.cpp API Key: {api_key}")
                kobold_ip = kobold_api_IP
                logging.debug(f"kobold.cpp: Kobold.cpp API IP:Port : {kobold_api_IP}")
                summary = summarize_with_kobold(kobold_ip, json_file_path)
            except requests.exceptions.ConnectionError:
                r.status_code = "Connection: "
        elif api_name.lower() == 'ooba':
            token = ooba_api_key
            ooba_ip = ooba_api_IP
            try:
                logging.debug(f"MAIN: Trying to summarize with oobabooga")
                logging.debug("Main: Trying to summarize with oobabooga")
                token = ooba_api_key
                logging.debug(f"oobabooga: ooba API Key: {api_key}")
                ooba_ip = ooba_api_IP
                logging.debug(f"oobabooga: ooba API IP:Port : {ooba_ip}")
                summary = summarize_with_oobabooga(ooba_ip, json_file_path)
            except requests.exceptions.ConnectionError:
                r.status_code = "Connection: "
        if api_name.lower() == 'huggingface':
            api_key = huggingface_api_key
            try:
                logging.debug(f"MAIN: Trying to summarize with huggingface")
                logging.debug("MAIN: Trying to summarize with huggingface")
                api_key = huggingface_api_key
                logging.debug(f"huggingface: huggingface API Key: {api_key}")
                summarize_with_huggingface(api_key, json_file_path)
            except requests.exceptions.ConnectionError:
                r.status_code = "Connection: "
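Each branch above follows the same shape: pick the key for the chosen backend, call its `summarize_with_*` function, and swallow connection errors. A compact sketch of the same dispatch expressed as a lookup table (this is a hypothetical refactor, not how the repo is written; the function and variable names are taken from the diff above):

```
# Hypothetical refactor of the if/elif chain into a dispatch table.
SUMMARIZERS = {
    'openai':      lambda path: summarize_with_openai(openai_api_key, path, openai_model),
    'anthropic':   lambda path: summarize_with_claude(anthropic_api_key, path, anthropic_model),
    'cohere':      lambda path: summarize_with_cohere(cohere_api_key, path, cohere_model),
    'groq':        lambda path: summarize_with_groq(groq_api_key, path, groq_model),
    'llama':       lambda path: summarize_with_llama(llama_api_IP, path, llama_api_key),
    'kobold':      lambda path: summarize_with_kobold(kobold_api_IP, path),
    'ooba':        lambda path: summarize_with_oobabooga(ooba_api_IP, path),
    'huggingface': lambda path: summarize_with_huggingface(huggingface_api_key, path),
}

summarizer = SUMMARIZERS.get(api_name.lower())
if summarizer is None:
    logging.warning(f"Unknown API name: {api_name}")
else:
    try:
        summary = summarizer(json_file_path)
    except requests.exceptions.ConnectionError:
        logging.error(f"Connection error while summarizing with {api_name}")
```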
@@ -1411,7 +1473,6 @@ if __name__ == "__main__":
    parser.add_argument('input_path', type=str, help='Path or URL of the video', nargs='?')
    parser.add_argument('-v','--video', action='store_true', help='Download the video instead of just the audio')
    parser.add_argument('-api', '--api_name', type=str, help='API name for summarization (optional)')
    parser.add_argument('-key', '--api_key', type=str, help='API key for summarization (optional)')
    parser.add_argument('-ns', '--num_speakers', type=int, default=2, help='Number of speakers (default: 2)')
    parser.add_argument('-wm', '--whisper_model', type=str, default='small.en', help='Whisper model (default: small.en)')
    parser.add_argument('-off', '--offset', type=int, default=0, help='Offset in seconds (default: 0)')
config.txt
@@ -7,7 +7,7 @@ groq_api_key = <your_groq_api_key>
groq_model = llama3-70b-8192
openai_api_key = <openai_api_key>
openai_model = gpt-4-turbo
huggingface_api_token = <huggingface_api_token>
huggingface_api_key = <huggingface_api_key>
huggingface_model = CohereForAI/c4ai-command-r-plus
summarize.py (36 lines changed)
@@ -67,10 +67,20 @@ config.read('config.txt')

# API Keys
anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
logging.debug(f"Loaded Anthropic API Key: {anthropic_api_key}")

cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
logging.debug(f"Loaded cohere API Key: {cohere_api_key}")

groq_api_key = config.get('API', 'groq_api_key', fallback=None)
logging.debug(f"Loaded groq API Key: {groq_api_key}")

openai_api_key = config.get('API', 'openai_api_key', fallback=None)
logging.debug(f"Loaded openAI Face API Key: {openai_api_key}")

huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
logging.debug(f"Loaded HuggingFace Face API Key: {huggingface_api_key}")

# Models
anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
@@ -564,6 +574,7 @@ def convert_to_wav(video_file_path, offset=0):
    except Exception as e:
        logging.error("Unexpected error occurred: %s", str(e))
        raise RuntimeError("Error converting video file to WAV")
        exit()
    return out_path
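Once `raise RuntimeError(...)` executes, control leaves `convert_to_wav`, so the `exit()` on the next line can never run. A minimal sketch of the intended error path (the conversion body is elided and marked as such; the `_sketch` name is hypothetical):

```
def convert_to_wav_sketch(video_file_path, offset=0):
    try:
        out_path = "..."  # elided: the actual ffmpeg conversion happens here
        return out_path
    except Exception as e:
        logging.error("Unexpected error occurred: %s", str(e))
        # Raising ends the function; an exit() after this line would be unreachable.
        raise RuntimeError("Error converting video file to WAV") from e
```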
@@ -746,10 +757,10 @@ def speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embed
#
#

# Summarize with OpenAI ChatGPT
def extract_text_from_segments(segments):
    logging.debug(f"openai: extracting text from {segments}")
    logging.debug(f"Main: extracting text from {segments}")
    text = ' '.join([segment['text'] for segment in segments])
    logging.debug(f"Main: Successfully extracted text from {segments}")
    return text
@@ -1150,19 +1161,26 @@ def summarize_with_huggingface(api_key, file_path):
            segments = json.load(file)

        logging.debug("huggingface: Extracting text from the segments")
        logging.debug(f"huggingface: Segments: {segments}")
        text = ' '.join([segment['text'] for segment in segments])

        api_key = os.environ.get('HF_TOKEN')

        # API KEY ASSIGNMENT HERE
        api_key = huggingface_api_key
        print(f"huggingface: lets make sure the HF api key exists...\n\t {huggingface_api_key}")
        headers = {
            "Authorization": f"Bearer {api_key}"
            "Authorization": f"Bearer {huggingface_api_key}"
        }

        model = "microsoft/Phi-3-mini-128k-instruct"
        API_URL = f"https://api-inference.huggingface.co/models/{model}"
        data = {
            "inputs": text,
            "parameters": {"max_length": 512, "min_length": 100}  # You can adjust max_length and min_length as needed
        }

        print(f"huggingface: lets make sure the HF api key is the same..\n\t {huggingface_api_key}")

        logging.debug("huggingface: Submitting request...")
        response = requests.post(API_URL, headers=headers, json=data)
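The two `print(...)` checks above write the full HuggingFace key to stdout. If a sanity check is wanted, one option is to log only a redacted form; a small sketch (not the repo's code):

```
def redact(secret):
    # Show only a short prefix, enough to confirm which key was loaded.
    if not secret:
        return "<missing>"
    return secret[:4] + "..." + f"({len(secret)} chars)"

logging.debug("huggingface: API key loaded: %s", redact(huggingface_api_key))
```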
@@ -1213,14 +1231,15 @@ def launch_ui(demo_mode=False):
        inputs=inputs,
        outputs=[
            gr.components.Textbox(label="Transcription", value=lambda: "", max_lines=10),
            gr.components.Textbox(label="Summary"),
            gr.components.Textbox(label="Summary or Status Message"),
            gr.components.File(label="Download Transcription as JSON"),
            gr.components.File(label="Download Summary as text", visible=lambda summary_file_path: summary_file_path is not None)
        ],
        title="Video Transcription and Summarization",
        description="Submit a video URL for transcription and summarization.",
        allow_flagging="never",
        theme='bethecloud/storj_theme'
        #https://huggingface.co/spaces/bethecloud/storj_theme
        theme="bethecloud/storj_theme"
    )

    iface.launch(share=True)
@@ -1382,7 +1401,6 @@ if __name__ == "__main__":
    parser.add_argument('input_path', type=str, help='Path or URL of the video', nargs='?')
    parser.add_argument('-v','--video', action='store_true', help='Download the video instead of just the audio')
    parser.add_argument('-api', '--api_name', type=str, help='API name for summarization (optional)')
    parser.add_argument('-key', '--api_key', type=str, help='API key for summarization (optional)')
    parser.add_argument('-ns', '--num_speakers', type=int, default=2, help='Number of speakers (default: 2)')
    parser.add_argument('-wm', '--whisper_model', type=str, default='small.en', help='Whisper model (default: small.en)')
    parser.add_argument('-off', '--offset', type=int, default=0, help='Offset in seconds (default: 0)')
@@ -1415,8 +1433,10 @@ if __name__ == "__main__":
    if args.api_name and args.api_key:
        logging.info(f'API: {args.api_name}')
        logging.info('Summarization will be performed.')
        summary = None  # Initialize to ensure it's always defined
    else:
        logging.info('No API specified. Summarization will not be performed.')
        summary = None  # Initialize to ensure it's always defined

    logging.debug("Platform check being performed...")
    platform_check()