Spaces:

RASMUS
/

Youtube-videos-with-crosslingual-transcriptions

Build error

App Files Files Community

RASMUS committed on Dec 21, 2022

Commit

7ed286f

1 Parent(s): 797b8a0

Create app.py

Browse files

Files changed (1) hide show

app.py +367 -0

app.py ADDED Viewed

	@@ -0,0 +1,367 @@

+import gradio as gr
+import os
+from pathlib import Path
+import time
+import pandas as pd
+import re
+import time
+import os
+import whisper
+from pytube import YouTube
+import psutil
+num_cores = psutil.cpu_count()
+os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+import torch
+# is cuda available?
+from easynmt import EasyNMT
+translation_model = EasyNMT('m2m_100_418M', max_new_tokens=60)
+asr_model = whisper.load_model("base")
+transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
+translation_models = {
+"Afrikaans":"af",
+"Amharic":"am",
+"Arabic":"ar",
+"Asturian ":"st",
+"Azerbaijani":"az",
+"Bashkir":"ba",
+"Belarusian":"be",
+"Bulgarian":"bg",
+"Bengali":"bn",
+"Breton":"br",
+"Bosnian":"bs",
+"Catalan; Valencian":"ca",
+"Cebuano":"eb",
+"Czech":"cs",
+"Welsh":"cy",
+"Danish":"da",
+"German":"de",
+"Greeek":"el",
+"English":"en",
+"Spanish":"es",
+"Estonian":"et",
+"Persian":"fa",
+"Fulah":"ff",
+"Finnish":"fi",
+"French":"fr",
+"Western Frisian":"fy",
+"Irish":"ga",
+"Gaelic; Scottish Gaelic":"gd",
+"Galician":"gl",
+"Gujarati":"gu",
+"Hausa":"ha",
+"Hebrew":"he",
+"Hindi":"hi",
+"Croatian":"hr",
+"Haitian; Haitian Creole":"ht",
+"Hungarian":"hu",
+"Armenian":"hy",
+"Indonesian":"id",
+"Igbo":"ig",
+"Iloko":"lo",
+"Icelandic":"is",
+"Italian":"it",
+"Japanese":"ja",
+"Javanese":"jv",
+"Georgian":"ka",
+"Kazakh":"kk",
+"Central Khmer":"km",
+"Kannada":"kn",
+"Korean":"ko",
+"Luxembourgish; Letzeburgesch":"lb",
+"Ganda":"lg",
+"Lingala":"ln",
+"Lao":"lo",
+"Lithuanian":"lt",
+"Latvian":"lv",
+"Malagasy":"mg",
+"Macedonian":"mk",
+"Malayalam":"ml",
+"Mongolian":"mn",
+"Marathi":"mr",
+"Malay":"ms",
+"Burmese":"my",
+"Nepali":"ne",
+"Dutch; Flemish":"nl",
+"Norwegian":"no",
+"Northern Sotho":"ns",
+"Occitan (post 1500)":"oc",
+"Oriya":"or",
+"Panjabi; Punjabi":"pa",
+"Polish":"pl",
+"Pushto; Pashto":"ps",
+"Portuguese":"pt",
+"Romanian; Moldavian; Moldovan":"ro",
+"Russian":"ru",
+"Sindhi":"sd",
+"Sinhala; Sinhalese":"si",
+"Slovak":"sk",
+"Slovenian":"sl",
+"Somali":"so",
+"Albanian":"sq",
+"Serbian":"sr",
+"Swati":"ss",
+"Sundanese":"su",
+"Swedish":"sv",
+"Swahili":"sw",
+"Tamil":"ta",
+"Thai":"th",
+"Tagalog":"tl",
+"Tswana":"tn",
+"Turkish":"tr",
+"Ukrainian":"uk",
+"Urdu":"ur",
+"Uzbek":"uz",
+"Vietnamese":"vi",
+"Wolof":"wo",
+"Xhosa":"xh",
+"Yiddish":"yi",
+"Yoruba":"yo",
+"Chinese":"zh",
+"Zulu":"zu"
+}
+translation_models_list = [key[0] for key in translation_models.items()]
+device = "cpu"#torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("DEVICE IS: ")
+print(device)
+videos_out_path = Path("./videos_out")
+videos_out_path.mkdir(parents=True, exist_ok=True)
+def get_youtube(video_url):
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+    print("LADATATTU POLKUUN")
+    print(abs_video_path)
+    return abs_video_path
+async def speech_to_text(video_file_path, selected_translation_lang):
+    """
+    # Youtube with translated subtitles using OpenAI Whisper and Opus-MT models.
+    # Currently supports only English audio
+    This space allows you to:
+    1. Download youtube video with a given url
+    2. Watch it in the first video component
+    3. Run automatic speech recognition on the video using Whisper
+    4. Translate the recognized transcriptions to Finnish, Swedish, Danish
+    5. Burn the translations to the original video and watch the video in the 2nd video component
+    Speech Recognition is based on OpenAI Whisper https://github.com/openai/whisper
+    """
+    if(video_file_path == None):
+        raise ValueError("Error no video input")
+    print(video_file_path)
+    try:
+        audio = whisper.load_audio(video_file_path)
+    except Exception as e:
+        raise RuntimeError("Error converting video to audio")
+    last_time = time.time()
+    try:
+        print(f'Transcribing via local model')
+        transcribe_options = dict(beam_size=5, best_of=5, without_timestamps=False)
+        transcription = asr_model.transcribe(audio, **transcribe_options)
+        #translation_options = dict(language=selected_translation_lang, beam_size=5, best_of=5, without_timestamps=False)
+        #translations = asr_model.transcribe(audio, **translation_options)
+        df = pd.DataFrame(columns=['start','end','text'])
+        for i,segment in enumerate(transcription['segments']):
+            new_row = {'start': segment['start'],
+            'end': segment['end'],
+            'text': segment['text']
+                            }
+            df = df.append(new_row, ignore_index=True)
+        if selected_translation_lang is None:
+                    selected_translation_lang = 'Finnish'
+        sentences = df['text']
+        df['translation'] = translation_model.translate(sentences, target_lang=translation_models.get(selected_translation_lang), max_new_tokens = 50)
+        print('After translation to target language \n')
+        return (df)
+    except Exception as e:
+        raise RuntimeError("Error Running inference with local model", e)
+def create_srt_and_burn(df, video_in):
+    print("Starting creation of video wit srt")
+    with open('testi.srt','w', encoding="utf-8") as file:
+        for i in range(len(df)):
+            file.write(str(i+1))
+            file.write('\n')
+            start = df.iloc[i]['start']
+            milliseconds = round(start * 1000.0)
+            hours = milliseconds // 3_600_000
+            milliseconds -= hours * 3_600_000
+            minutes = milliseconds // 60_000
+            milliseconds -= minutes * 60_000
+            seconds = milliseconds // 1_000
+            milliseconds -= seconds * 1_000
+            file.write(f"{hours}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}")
+            stop = df.iloc[i]['end']
+            milliseconds = round(stop * 1000.0)
+            hours = milliseconds // 3_600_000
+            milliseconds -= hours * 3_600_000
+            minutes = milliseconds // 60_000
+            milliseconds -= minutes * 60_000
+            seconds = milliseconds // 1_000
+            milliseconds -= seconds * 1_000
+            file.write(' --> ')
+            file.write(f"{hours}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}")
+            file.write('\n')
+            file.writelines(df.iloc[i]['translation'])
+            if int(i) != len(df)-1:
+                file.write('\n\n')
+    print("SRT DONE")
+    try:
+        file1 = open('./testi.srt', 'r', encoding="utf-8")
+        Lines = file1.readlines()
+        count = 0
+        # Strips the newline character
+        for line in Lines:
+            count += 1
+            print("{}".format(line))
+        print(type(video_in))
+        print(video_in)
+        video_out = video_in.replace('.mp4', '_out.mp4')
+        print(video_out)
+        command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out)
+        print(command)
+        os.system(command)
+        return video_out
+    except Exception as e:
+        print(e)
+        return video_out
+# ---- Gradio Layout ----- (components are created first and placed later via .render() inside the Blocks context)
+video_in = gr.Video(label="Video file", mirror_webcam=False)  # source video; also the output target of the download button
+youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)  # URL passed to get_youtube
+video_out = gr.Video(label="Video Out", mirror_webcam=False)  # receives the path returned by create_srt_and_burn
+df_init = pd.DataFrame(columns=['start','end','text','translation'])  # empty frame with the columns speech_to_text returns
+selected_translation_lang = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="Language to translate transcriptions to", interactive=True)  # choices are the display names from translation_models
+transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10)  # output of step 2, input of step 3
+demo = gr.Blocks(css='''
+#cut_btn, #reset_btn { align-self:stretch; }
+#\\31 3 { max-width: 540px; }
+.output-markdown {max-width: 65ch !important;}
+''')  # NOTE(review): no element with id cut_btn / reset_btn is created in the visible code - confirm these rules are still needed
+demo.encrypt = False  # NOTE(review): looks like a legacy Gradio setting - confirm it has any effect
+with demo:
+    transcription_var = gr.Variable()  # NOTE(review): never referenced again in the visible code
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            ### This space allows you to:
+            ##### 1. Download youtube video with a given URL
+            ##### 2. Watch it in the first video component
+            ##### 3. Run automatic speech recognition on the video using Whisper (Please remember to select translation language)
+            ##### 4. Translate the recognized transcriptions to Finnish, Swedish, Danish
+            ##### 5. Burn the translations to the original video and watch the video in the 2nd video component
+            ''')  # NOTE(review): step 4 names three languages, while the dropdown lists every key of translation_models
+        with gr.Column():
+            gr.Markdown('''
+            ### 1. Insert Youtube URL below (Some examples below which I suggest to use for first tests)
+            ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
+            ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
+            ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
+            ''')
+    with gr.Row():
+        with gr.Column():
+            youtube_url_in.render()
+            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
+            download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                video_in])  # step 1: the downloaded file path is sent to the input video component
+            print(video_in)  # debug output; runs once while the layout is built and prints the component object
+    with gr.Row():
+        with gr.Column():
+            video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                ##### Here you can start the transcription and translation process.
+                ##### Be aware that processing will last for a while (35 second video took around 20 seconds in my testing)
+                ''')
+            transcribe_btn = gr.Button("Step 2. Transcribe and translate audio")
+            transcribe_btn.click(speech_to_text, [video_in, selected_translation_lang], transcription_df)  # step 2: the returned frame fills the dataframe component
+    with gr.Row():
+        with gr.Column():
+            selected_translation_lang.render()  # the dropdown is placed below the transcribe button that reads it
+    with gr.Row():
+        gr.Markdown('''
+        ##### Here you will get transcription and translation output
+        ##### If you see error please remember to select translation language
+        ##### ''')
+    with gr.Row():
+        with gr.Column():
+            transcription_df.render()
+    with gr.Row():
+        with gr.Column():
+            translate_and_make_srt_btn = gr.Button("Step 3. Create and burn srt to video")
+            print(video_in)  # debug output; runs once while the layout is built
+            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_df,video_in], [
+                video_out])  # step 3: the returned path is shown in the output video component
+            video_out.render()
+if __name__ == "__main__":  # launch only when executed as a script
+    demo.queue().launch(debug=True, share=False, enable_queue=True)  # NOTE(review): queue() is called and enable_queue is also passed - confirm both are needed for the installed Gradio version