initial commit

Xaloc 2023-10-04 16:18:18 +02:00
commit 6f9cc3a2c4
4 changed files with 120 additions and 0 deletions

5
.gitignore vendored Normal file

@ -0,0 +1,5 @@
speech/
__pycache__
audio*
*.wav
*.mp3

21
README.md Normal file

@ -0,0 +1,21 @@
# Podcast filter
This program takes a WAV file and produces a transcript of the audio. The goal is to filter the parts of a podcast you're interested in by using a keyword list, but that part is still a work in progress.
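
A minimal sketch of the intended keyword filtering (hypothetical, since this part isn't implemented yet; the `transcript` string stands in for the output of the transcription step, and the keyword list is the one from `app.py`):
```python
# Hypothetical sketch: keep only transcript sentences that mention a keyword.
keywords = ["ciberseguretat", "hacker", "atac", "pentesting"]  # same list as in app.py
transcript = "..."  # placeholder: the text returned by the transcription step

matches = [
    sentence.strip()
    for sentence in transcript.split(".")
    if any(keyword in sentence.lower() for keyword in keywords)
]
print("\n".join(matches))
```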
## Installation on GNU+Linux
### Step 1
Clone the repository
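For example (the repository URL below is a placeholder; use this repo's address):
```bash
git clone <repository-url>
```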
### Step 2
Go to the repository folder and create a virtual environment
```bash
python -m venv <name>
```
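You'll likely want to activate the environment before installing anything (a typical venv workflow, assuming bash; `<name>` is whatever you chose above):
```bash
source <name>/bin/activate
```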
### Step 3
Install the requirements (there are a lot of them because we use the Whisper engine for speech-to-text)
```bash
pip install -r requirements.txt
```
## Usage
Change the name of the file you want to transcribe in the code and make sure it's in the same folder as the program (or give its path in the code). Run the program with `python app.py` and enjoy the transcript.
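The file name lives in the `fname` variable near the top of `app.py` (the name below is just a placeholder):
```python
# app.py: point fname at your own recording
fname = "my_episode.wav"  # placeholder; use your file name or a full path
```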

53
app.py Normal file

@ -0,0 +1,53 @@
import speech_recognition as sr
import os
from pydub import AudioSegment

# To convert an mp3 to wav first:
# sound = AudioSegment.from_mp3("test.mp3")
# sound.export("test.wav", format="wav")

fname = "ciberseguretat.wav"
keyWords = ['ciberseguretat', 'hacker', 'atac', 'pentesting']

r = sr.Recognizer()


def transcript_audio(audio):
    """Transcribe a single audio file with Whisper (Catalan)."""
    with sr.AudioFile(audio) as source:
        audio_data = r.record(source)
        text = r.recognize_whisper(audio_data, language='ca')
    return text


def large_audio(path, minutes=5):
    """Split a large audio file into fixed-interval chunks
    and apply speech recognition on each of these chunks."""
    print("Loading file")
    sound = AudioSegment.from_file(path)
    print(len(sound))  # duration in milliseconds
    print("Splitting file")
    chunk_length_ms = int(1000 * 60 * minutes)  # chunk length in milliseconds
    chunks = [sound[i:i + chunk_length_ms] for i in range(0, len(sound), chunk_length_ms)]
    folder_name = "audio-fixed-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    print("Starting transcription")
    for i, audio_chunk in enumerate(chunks, start=1):
        # Export the audio chunk and save it in the `folder_name` directory.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # Recognize the chunk.
        try:
            text = transcript_audio(chunk_filename)
        except sr.UnknownValueError as e:
            print("Error:", str(e))
        else:
            text = f"{text.capitalize()}. "
            print(chunk_filename, ":", text)
            whole_text += text
    # Return the text for all chunks.
    return whole_text


if __name__ == "__main__":
    print(large_audio(fname))

41
requirements.txt Normal file

@ -0,0 +1,41 @@
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.0
cmake==3.27.6
filelock==3.12.4
idna==3.4
Jinja2==3.1.2
lit==17.0.2
llvmlite==0.41.0
MarkupSafe==2.1.3
more-itertools==10.1.0
mpmath==1.3.0
networkx==3.1
numba==0.58.0
numpy==1.25.2
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
openai-whisper @ git+https://github.com/openai/whisper.git@0a60fcaa9b86748389a656aa013c416030287d47
pycparser==2.21
pydub==0.25.1
regex==2023.10.3
requests==2.31.0
six==1.16.0
soundfile==0.12.1
SpeechRecognition==3.10.0
sympy==1.12
tiktoken==0.3.3
torch==2.0.1
tqdm==4.66.1
triton==2.0.0
typing_extensions==4.8.0
urllib3==2.0.6