
SPEECH PROCESSING


Understanding Audio Data, Fourier Transform, FFT and Spectrogram Features for a Speech Recognition System

An introduction to audio data analysis (sound analysis) using Python


Speech processing is the study of speech signals and their processing methods. The signals are usually handled in a digital representation, so speech processing can be regarded as a special case of digital signal processing applied to speech. Aspects of speech processing include the acquisition, manipulation, storage, transfer and output of speech signals. On the input side this is speech recognition; on the output side it is speech synthesis.


READ AUDIO FILE

import wave

chunk = 1024  # number of frames to read at a time

f = wave.open('/content/drive/My Drive/Colab Notebooks/colab_tutorials/file.wav', 'rb')
data = f.readframes(chunk)  # raw bytes for the first `chunk` frames
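readframes() returns raw bytes. To work with the samples numerically they can be viewed as a NumPy array; a minimal sketch, assuming the file contains 16-bit PCM (check that f.getsampwidth() == 2 first):

import numpy as np

samples = np.frombuffer(data, dtype=np.int16)  # interpret the raw bytes as 16-bit samples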

PLAY AUDIO FILE

import pyaudio
import wave

# define stream chunk
chunk = 1024

# open a WAV file
f = wave.open("/home/ihrd/Avigyan/whitenoisegaussian.wav", "rb")
# instantiate PyAudio
p = pyaudio.PyAudio()
# open an output stream matching the file's sample width, channels and rate
stream = p.open(format = p.get_format_from_width(f.getsampwidth()),
                channels = f.getnchannels(),
                rate = f.getframerate(),
                output = True)
# read the first chunk of data
data = f.readframes(chunk)

# play stream chunk by chunk
while data:
    stream.write(data)
    data = f.readframes(chunk)

# stop stream
stream.stop_stream()
stream.close()

# close PyAudio
p.terminate()
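Reading and writing one chunk at a time keeps memory use constant regardless of file length: in blocking mode stream.write() returns only after the device has consumed the buffer, so the loop naturally plays the file in real time. Note that PyAudio is a binding to the PortAudio library, which must be installed on the system.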

READ AND PLOT SPEECH DATA


import matplotlib.pyplot as plt
from scipy.io import wavfile  # WAV read/write API
import numpy as np

fs, data = wavfile.read('/content/drive/MyDrive/Colab Notebooks/colab_tutorials/OAF_base_happy.wav')
plt.plot(data, 'g')
plt.show()
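The plot above uses the sample index on the x-axis; dividing by the sample rate gives time in seconds, which is usually more readable:

t = np.arange(len(data)) / fs  # time in seconds
plt.plot(t, data, 'g')
plt.xlabel('Time [s]')
plt.ylabel('Amplitude')
plt.show()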

FAST FOURIER TRANSFORM


import matplotlib.pyplot as plt
from scipy.fftpack import fft
from scipy.io import wavfile  # WAV read/write API
import numpy as np

fs, data = wavfile.read('file.wav')  # load the data
a = data.T[0]  # this is a two-channel soundtrack; take the first channel
b = [(ele / 2**8.) * 2 - 1 for ele in a]  # assuming an 8-bit track, b is normalized to [-1, 1)
c = fft(b)  # compute the Fourier transform (an array of complex numbers)
d = len(c) // 2  # only half of the FFT is needed (real-signal symmetry)
plt.plot(abs(c[:d]), 'r')
plt.show()
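The x-axis of this plot is the FFT bin index, not frequency. To label it in Hz, each bin can be mapped to its centre frequency with np.fft.fftfreq, reusing fs, c and d from above:

freqs = np.fft.fftfreq(len(c), d=1.0/fs)  # bin centre frequencies in Hz
plt.plot(freqs[:d], abs(c[:d]), 'r')
plt.xlabel('Frequency [Hz]')
plt.show()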

SPECTROGRAM USING LIBROSA

_wav_file_ = "africa-toto.wav"

def spectrogram_librosa(_wav_file_):
    import librosa
    import pylab
    import numpy as np

    (sig, rate) = librosa.load(_wav_file_, sr=None, mono=True, dtype=np.float32)
    pylab.specgram(sig, Fs=rate)
    pylab.savefig('spectrogram3.png')

def graph_spectrogram_wave(wav_file):
    import wave
    import pylab
    import numpy as np
    def get_wav_info(wav_file):
        wav = wave.open(wav_file, 'r')
        frames = wav.readframes(-1)
        sound_info = np.frombuffer(frames, dtype=np.int16)  # fromstring is deprecated
        frame_rate = wav.getframerate()
        wav.close()
        return sound_info, frame_rate
    sound_info, frame_rate = get_wav_info(wav_file)
    pylab.figure(num=3, figsize=(10, 6))
    pylab.title('spectrogram pylab with wav_file')
    pylab.specgram(sound_info, Fs=frame_rate)
    pylab.savefig('spectrogram2.png')


def graph_wavfileread(_wav_file_):
    import matplotlib.pyplot as plt
    from scipy import signal
    from scipy.io import wavfile
    import numpy as np   
    sample_rate, samples = wavfile.read(_wav_file_)   
    frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate, nfft=1024)
    plt.pcolormesh(times, frequencies, 10*np.log10(spectrogram + 1e-10))  # small offset avoids log(0)
    plt.ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')
    plt.savefig("spectrogram1.png")
    

spectrogram_librosa(_wav_file_)
#graph_wavfileread(_wav_file_)
#graph_spectrogram_wave(_wav_file_)
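For mel-scaled spectrograms specifically, librosa has built-in helpers; below is a minimal sketch along the same lines as the functions above (librosa.display must be imported explicitly, and the output filename is arbitrary):

def mel_spectrogram_librosa(_wav_file_):
    import librosa
    import librosa.display
    import matplotlib.pyplot as plt
    import numpy as np

    sig, rate = librosa.load(_wav_file_, sr=None, mono=True)
    S = librosa.feature.melspectrogram(y=sig, sr=rate)   # power mel spectrogram
    S_db = librosa.power_to_db(S, ref=np.max)            # convert power to decibels
    librosa.display.specshow(S_db, sr=rate, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.savefig('mel_spectrogram.png')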

ONE-SIDED FFT SPECTRUM

import scipy.io.wavfile as wavfile
import scipy.fftpack
import numpy as np
from matplotlib import pyplot as plt

fs_rate, signal = wavfile.read("/content/drive/My Drive/Colab Notebooks/colab_tutorials/file.wav")
print("Frequency sampling", fs_rate)

l_audio = len(signal.shape)
print("Channels", l_audio)
if l_audio == 2:
    signal = signal.sum(axis=1) / 2  # average the two channels to get mono

N = signal.shape[0]
print("Complete Samplings N", N)

secs = N / float(fs_rate)
print("secs", secs)

Ts = 1.0 / fs_rate  # sampling interval in time
print("Timestep between samples Ts", Ts)

t = np.arange(0, secs, Ts)  # time vector as a numpy.ndarray

FFT = abs(scipy.fftpack.fft(signal))
FFT_side = FFT[:N//2]  # one-sided FFT range
freqs = scipy.fftpack.fftfreq(signal.size, t[1] - t[0])
freqs_side = freqs[:N//2]  # one-sided frequency range

plt.subplot(311)
p1 = plt.plot(t, signal, "g")  # plotting the signal
plt.xlabel('Time')
plt.ylabel('Amplitude')

plt.subplot(312)
p2 = plt.plot(freqs, FFT, "r")  # plotting the complete (two-sided) FFT spectrum
plt.xlabel('Frequency (Hz)')
plt.ylabel('Count dbl-sided')

plt.subplot(313)
p3 = plt.plot(freqs_side, FFT_side, "b")  # plotting the positive (one-sided) FFT spectrum
plt.xlabel('Frequency (Hz)')
plt.ylabel('Count single-sided')

plt.show()
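Only the first N//2 bins are kept in the one-sided plot because the input signal is real-valued: its spectrum is conjugate-symmetric, so the negative-frequency half mirrors the positive half and carries no extra information.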

AUDIO PLAY USING PYGAME

import pygame
file = 'rp.mp3'
pygame.init()
pygame.mixer.init()
pygame.mixer.music.load(file)
pygame.mixer.music.play()
while pygame.mixer.music.get_busy():  # wait until playback finishes
    pygame.time.Clock().tick(10)
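pygame.mixer.music streams the file from disk rather than decoding it all into memory, so get_busy() is polled until playback ends; Clock().tick(10) caps that polling loop at ten iterations per second.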

MFCC theory and implementation

Theory

Mel Frequency Cepstral Coefficients (MFCCs) are a way of extracting features from audio. MFCC computation uses the mel scale to divide the frequency band into sub-bands and then extracts the cepstral coefficients using the Discrete Cosine Transform (DCT). The mel scale is based on how humans perceive differences between frequencies (approximately m = 2595 * log10(1 + f/700)), which makes it very convenient for processing sounds.


def freq_to_mel(freq):
    return 2595.0 * np.log10(1.0 + freq / 700.0)

def mel_to_freq(mels):
    return 700.0 * (10.0**(mels / 2595.0) - 1.0)

def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    return audio
def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
    # hop_size in ms
    
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num,FFT_size))
    
    for n in range(frame_num):
        frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
    
    return frames
def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    fmin_mel = freq_to_mel(fmin)
    fmax_mel = freq_to_mel(fmax)
    
    print("MEL min: {0}".format(fmin_mel))
    print("MEL max: {0}".format(fmax_mel))
    
    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num+2)
    freqs = mel_to_freq(mels)
    
    return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs
def get_filters(filter_points, FFT_size):
    filters = np.zeros((len(filter_points)-2,int(FFT_size/2+1)))
    
    for n in range(len(filter_points)-2):
        filters[n, filter_points[n] : filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
        filters[n, filter_points[n + 1] : filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
    
    return filters
def dct(dct_filter_num, filter_len):
    basis = np.empty((dct_filter_num,filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)
    
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
        
    return basis
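A quick sanity check of the helpers (assuming numpy is imported as np, as in the main program below): 1000 Hz should map to roughly 1000 mel by construction of the scale, the two conversions should invert each other, and dct() should return one basis row per filter.

print(freq_to_mel(1000.0))                # ~1000 mel
print(mel_to_freq(freq_to_mel(440.0)))    # ~440.0 Hz: the conversions are inverses
print(dct(12, 10).shape)                  # (12, 10): 12 DCT filters over 10 mel bands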

Main Program

import os
import numpy as np
import scipy
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as ipd
import matplotlib.pyplot as plt

%matplotlib inline

TRAIN_PATH = '../input/audio_train/'
ipd.Audio(TRAIN_PATH + "a439d172.wav")

sample_rate, audio = wavfile.read(TRAIN_PATH + "a439d172.wav")
print("Sample rate: {0}Hz".format(sample_rate))
print("Audio duration: {0}s".format(len(audio) / sample_rate))

audio = normalize_audio(audio)
plt.figure(figsize=(15,4))
plt.plot(np.linspace(0, len(audio) / sample_rate, num=len(audio)), audio)
plt.grid(True)

hop_size = 15 #ms
FFT_size = 2048

audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
print("Framed audio shape: {0}".format(audio_framed.shape))

print("First frame:")
audio_framed[0]

print("Last frame:")
audio_framed[-1]

window = get_window("hann", FFT_size, fftbins=True)  # Hann window reduces spectral leakage at frame edges
plt.figure(figsize=(15,4))
plt.plot(window)
plt.grid(True)

audio_win = audio_framed * window  # apply the window to every frame (broadcasts over rows)

ind = 69
plt.figure(figsize=(15,6))
plt.subplot(2, 1, 1)
plt.plot(audio_framed[ind])
plt.title('Original Frame')
plt.grid(True)
plt.subplot(2, 1, 2)
plt.plot(audio_win[ind])
plt.title('Frame After Windowing')
plt.grid(True)

audio_winT = np.transpose(audio_win)

# FFT of each windowed frame, keeping only the positive-frequency bins
audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')
for n in range(audio_fft.shape[1]):
    audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]

audio_fft = np.transpose(audio_fft)

audio_power = np.square(np.abs(audio_fft))  # power spectrum of each frame
print(audio_power.shape)

freq_min = 0
freq_high = sample_rate / 2
mel_filter_num = 10

print("Minimum frequency: {0}".format(freq_min))
print("Maximum frequency: {0}".format(freq_high))

filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=sample_rate)
filter_points

filters = get_filters(filter_points, FFT_size)

plt.figure(figsize=(15,4))
for n in range(filters.shape[0]):
    plt.plot(filters[n])

# taken from the librosa library
enorm = 2.0 / (mel_freqs[2:mel_filter_num+2] - mel_freqs[:mel_filter_num])
filters *= enorm[:, np.newaxis]

plt.figure(figsize=(15,4))
for n in range(filters.shape[0]):
    plt.plot(filters[n])

audio_filtered = np.dot(filters, np.transpose(audio_power))  # apply the mel filter bank
audio_log = 10.0 * np.log10(audio_filtered)  # compress dynamic range to decibels
audio_log.shape

dct_filter_num = 40

dct_filters = dct(dct_filter_num, mel_filter_num)

cepstral_coefficients = np.dot(dct_filters, audio_log)  # DCT decorrelates the log filter-bank energies
cepstral_coefficients.shape

cepstral_coefficients[:, 0]

plt.figure(figsize=(15,5))
plt.imshow(cepstral_coefficients, aspect='auto', origin='lower');