initial commit

Xaloc 2023-10-04 16:18:18 +02:00
commit 6f9cc3a2c4
4 changed files with 120 additions and 0 deletions

5
.gitignore vendored Normal file

@ -0,0 +1,5 @@
speech/
__pycache__
audio*
*.wav
*.mp3

21
README.md Normal file

@ -0,0 +1,21 @@
# Podcast filter
This program takes a WAV file and produces a transcript of the audio. The goal is to filter the parts of a podcast you're interested in by using a keyword list, but that part is still a work in progress.
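
A minimal sketch of the intended keyword filtering (hypothetical, since this part isn't implemented yet; the `transcript` string stands in for the output of the transcription step, and the keyword list is the one from `app.py`):
```python
# Hypothetical sketch: keep only transcript sentences that mention a keyword.
keywords = ["ciberseguretat", "hacker", "atac", "pentesting"]  # same list as in app.py
transcript = "..."  # placeholder: the text returned by the transcription step

matches = [
    sentence.strip()
    for sentence in transcript.split(".")
    if any(keyword in sentence.lower() for keyword in keywords)
]
print("\n".join(matches))
```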
## Installation on GNU+Linux
### Step 1
Clone the repository
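For example (the repository URL below is a placeholder; use this repo's address):
```bash
git clone <repository-url>
```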
### Step 2
Go to the repository folder and create a virtual environment
```bash
python -m venv <name>
```
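You'll likely want to activate the environment before installing anything (a typical venv workflow, assuming bash; `<name>` is whatever you chose above):
```bash
source <name>/bin/activate
```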
### Step 3
Install the requirements (there are a lot of them because we use the Whisper engine for speech-to-text)
```bash
pip install -r requirements.txt
```
## Usage
Change the name of the file you want to transcribe in the code and make sure it's in the same folder as the program (or give its path in the code). Run the program with `python app.py` and enjoy the transcript.
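The file name lives in the `fname` variable near the top of `app.py` (the name below is just a placeholder):
```python
# app.py: point fname at your own recording
fname = "my_episode.wav"  # placeholder; use your file name or a full path
```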

53
app.py Normal file

@ -0,0 +1,53 @@
import speech_recognition as sr
import os
from pydub import AudioSegment

# To convert an mp3 to wav first:
# sound = AudioSegment.from_mp3("test.mp3")
# sound.export("test.wav", format="wav")

fname = "ciberseguretat.wav"
keyWords = ['ciberseguretat', 'hacker', 'atac', 'pentesting']

r = sr.Recognizer()


def transcript_audio(audio):
    """Transcribe a single audio file with Whisper (Catalan)."""
    with sr.AudioFile(audio) as source:
        audio_data = r.record(source)
        text = r.recognize_whisper(audio_data, language='ca')
    return text


def large_audio(path, minutes=5):
    """Split a large audio file into fixed-interval chunks
    and apply speech recognition on each of these chunks."""
    print("Loading file")
    sound = AudioSegment.from_file(path)
    print(len(sound))  # duration in milliseconds
    print("Splitting file")
    chunk_length_ms = int(1000 * 60 * minutes)  # chunk length in milliseconds
    chunks = [sound[i:i + chunk_length_ms] for i in range(0, len(sound), chunk_length_ms)]
    folder_name = "audio-fixed-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    print("Starting transcription")
    for i, audio_chunk in enumerate(chunks, start=1):
        # Export the audio chunk and save it in the `folder_name` directory.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # Recognize the chunk.
        try:
            text = transcript_audio(chunk_filename)
        except sr.UnknownValueError as e:
            print("Error:", str(e))
        else:
            text = f"{text.capitalize()}. "
            print(chunk_filename, ":", text)
            whole_text += text
    # Return the text for all chunks.
    return whole_text


if __name__ == "__main__":
    print(large_audio(fname))

41
requirements.txt Normal file

@ -0,0 +1,41 @@
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.0
cmake==3.27.6
filelock==3.12.4
idna==3.4
Jinja2==3.1.2
lit==17.0.2
llvmlite==0.41.0
MarkupSafe==2.1.3
more-itertools==10.1.0
mpmath==1.3.0
networkx==3.1
numba==0.58.0
numpy==1.25.2
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
openai-whisper @ git+https://github.com/openai/whisper.git@0a60fcaa9b86748389a656aa013c416030287d47
pycparser==2.21
pydub==0.25.1
regex==2023.10.3
requests==2.31.0
six==1.16.0
soundfile==0.12.1
SpeechRecognition==3.10.0
sympy==1.12
tiktoken==0.3.3
torch==2.0.1
tqdm==4.66.1
triton==2.0.0
typing_extensions==4.8.0
urllib3==2.0.6