
SPEECH PROCESSING


Understanding Audio Data, Fourier Transform, FFT and Spectrogram Features for a Speech Recognition System

An introduction to audio data analysis (sound analysis) using Python


Speech processing is the study of speech signals and their processing methods. The signals are usually handled in a digital representation, so speech processing can be regarded as a special case of digital signal processing applied to speech. Aspects of speech processing include the acquisition, manipulation, storage, transfer and output of speech signals. On the input side this is speech recognition; on the output side it is speech synthesis.


READ AUDIO FILE

import wave

chunk = 1024  # number of frames to read at a time

f = wave.open('/content/drive/My Drive/Colab Notebooks/colab_tutorials/file.wav', 'rb')
data = f.readframes(chunk)  # raw bytes for the first `chunk` frames
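readframes() returns raw bytes. To work with the samples numerically they can be viewed as a NumPy array; a minimal sketch, assuming the file contains 16-bit PCM (check that f.getsampwidth() == 2 first):

import numpy as np

samples = np.frombuffer(data, dtype=np.int16)  # interpret the raw bytes as 16-bit samples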

PLAY AUDIO FILE

import pyaudio
import wave

# define stream chunk
chunk = 1024

# open a WAV file
f = wave.open("/home/ihrd/Avigyan/whitenoisegaussian.wav", "rb")
# instantiate PyAudio
p = pyaudio.PyAudio()
# open an output stream matching the file's sample width, channels and rate
stream = p.open(format = p.get_format_from_width(f.getsampwidth()),
                channels = f.getnchannels(),
                rate = f.getframerate(),
                output = True)
# read the first chunk of data
data = f.readframes(chunk)

# play stream chunk by chunk
while data:
    stream.write(data)
    data = f.readframes(chunk)

# stop stream
stream.stop_stream()
stream.close()

# close PyAudio
p.terminate()
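Reading and writing one chunk at a time keeps memory use constant regardless of file length: in blocking mode stream.write() returns only after the device has consumed the buffer, so the loop naturally plays the file in real time. Note that PyAudio is a binding to the PortAudio library, which must be installed on the system.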

READ AND PLOT SPEECH DATA


import matplotlib.pyplot as plt
from scipy.io import wavfile  # WAV read/write API
import numpy as np

fs, data = wavfile.read('/content/drive/MyDrive/Colab Notebooks/colab_tutorials/OAF_base_happy.wav')
plt.plot(data, 'g')
plt.show()
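The plot above uses the sample index on the x-axis; dividing by the sample rate gives time in seconds, which is usually more readable:

t = np.arange(len(data)) / fs  # time in seconds
plt.plot(t, data, 'g')
plt.xlabel('Time [s]')
plt.ylabel('Amplitude')
plt.show()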

FAST FOURIER TRANSFORM


import matplotlib.pyplot as plt
from scipy.fftpack import fft
from scipy.io import wavfile  # WAV read/write API
import numpy as np

fs, data = wavfile.read('file.wav')  # load the data
a = data.T[0]  # this is a two-channel soundtrack; take the first channel
b = [(ele / 2**8.) * 2 - 1 for ele in a]  # assuming an 8-bit track, b is normalized to [-1, 1)
c = fft(b)  # compute the Fourier transform (an array of complex numbers)
d = len(c) // 2  # only half of the FFT is needed (real-signal symmetry)
plt.plot(abs(c[:d]), 'r')
plt.show()
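The x-axis of this plot is the FFT bin index, not frequency. To label it in Hz, each bin can be mapped to its centre frequency with np.fft.fftfreq, reusing fs, c and d from above:

freqs = np.fft.fftfreq(len(c), d=1.0/fs)  # bin centre frequencies in Hz
plt.plot(freqs[:d], abs(c[:d]), 'r')
plt.xlabel('Frequency [Hz]')
plt.show()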

SPECTROGRAM USING LIBROSA

_wav_file_ = "africa-toto.wav"

def spectrogram_librosa(_wav_file_):
    import librosa
    import pylab
    import numpy as np

    (sig, rate) = librosa.load(_wav_file_, sr=None, mono=True, dtype=np.float32)
    pylab.specgram(sig, Fs=rate)
    pylab.savefig('spectrogram3.png')

def graph_spectrogram_wave(wav_file):
    import wave
    import pylab
    import numpy as np
    def get_wav_info(wav_file):
        wav = wave.open(wav_file, 'r')
        frames = wav.readframes(-1)
        sound_info = np.frombuffer(frames, dtype=np.int16)  # fromstring is deprecated
        frame_rate = wav.getframerate()
        wav.close()
        return sound_info, frame_rate
    sound_info, frame_rate = get_wav_info(wav_file)
    pylab.figure(num=3, figsize=(10, 6))
    pylab.title('spectrogram pylab with wav_file')
    pylab.specgram(sound_info, Fs=frame_rate)
    pylab.savefig('spectrogram2.png')


def graph_wavfileread(_wav_file_):
    import matplotlib.pyplot as plt
    from scipy import signal
    from scipy.io import wavfile
    import numpy as np   
    sample_rate, samples = wavfile.read(_wav_file_)   
    frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate, nfft=1024)
    plt.pcolormesh(times, frequencies, 10*np.log10(spectrogram + 1e-10))  # small offset avoids log(0)
    plt.ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')
    plt.savefig("spectrogram1.png")
    

spectrogram_librosa(_wav_file_)
#graph_wavfileread(_wav_file_)
#graph_spectrogram_wave(_wav_file_)
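For mel-scaled spectrograms specifically, librosa has built-in helpers; below is a minimal sketch along the same lines as the functions above (librosa.display must be imported explicitly, and the output filename is arbitrary):

def mel_spectrogram_librosa(_wav_file_):
    import librosa
    import librosa.display
    import matplotlib.pyplot as plt
    import numpy as np

    sig, rate = librosa.load(_wav_file_, sr=None, mono=True)
    S = librosa.feature.melspectrogram(y=sig, sr=rate)   # power mel spectrogram
    S_db = librosa.power_to_db(S, ref=np.max)            # convert power to decibels
    librosa.display.specshow(S_db, sr=rate, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.savefig('mel_spectrogram.png')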

ONE-SIDED FFT SPECTRUM

import scipy.io.wavfile as wavfile
import scipy.fftpack
import numpy as np
from matplotlib import pyplot as plt

fs_rate, signal = wavfile.read("/content/drive/My Drive/Colab Notebooks/colab_tutorials/file.wav")
print("Frequency sampling", fs_rate)

l_audio = len(signal.shape)
print("Channels", l_audio)
if l_audio == 2:
    signal = signal.sum(axis=1) / 2  # average the two channels to get mono

N = signal.shape[0]
print("Complete Samplings N", N)

secs = N / float(fs_rate)
print("secs", secs)

Ts = 1.0 / fs_rate  # sampling interval in time
print("Timestep between samples Ts", Ts)

t = np.arange(0, secs, Ts)  # time vector as a numpy.ndarray

FFT = abs(scipy.fftpack.fft(signal))
FFT_side = FFT[:N//2]  # one-sided FFT range
freqs = scipy.fftpack.fftfreq(signal.size, t[1] - t[0])
freqs_side = freqs[:N//2]  # one-sided frequency range

plt.subplot(311)
p1 = plt.plot(t, signal, "g")  # plotting the signal
plt.xlabel('Time')
plt.ylabel('Amplitude')

plt.subplot(312)
p2 = plt.plot(freqs, FFT, "r")  # plotting the complete (two-sided) FFT spectrum
plt.xlabel('Frequency (Hz)')
plt.ylabel('Count dbl-sided')

plt.subplot(313)
p3 = plt.plot(freqs_side, FFT_side, "b")  # plotting the positive (one-sided) FFT spectrum
plt.xlabel('Frequency (Hz)')
plt.ylabel('Count single-sided')

plt.show()
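Only the first N//2 bins are kept in the one-sided plot because the input signal is real-valued: its spectrum is conjugate-symmetric, so the negative-frequency half mirrors the positive half and carries no extra information.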

AUDIO PLAY USING PYGAME

import pygame
file = 'rp.mp3'
pygame.init()
pygame.mixer.init()
pygame.mixer.music.load(file)
pygame.mixer.music.play()
while pygame.mixer.music.get_busy():  # wait until playback finishes
    pygame.time.Clock().tick(10)
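pygame.mixer.music streams the file from disk rather than decoding it all into memory, so get_busy() is polled until playback ends; Clock().tick(10) caps that polling loop at ten iterations per second.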

MFCC theory and implementation

Theory

Mel Frequency Cepstral Coefficients (MFCCs) are a way of extracting features from audio. MFCC computation uses the mel scale to divide the frequency band into sub-bands and then extracts the cepstral coefficients using the Discrete Cosine Transform (DCT). The mel scale is based on how humans perceive differences between frequencies (approximately m = 2595 * log10(1 + f/700)), which makes it very convenient for processing sounds.


def freq_to_mel(freq):
    return 2595.0 * np.log10(1.0 + freq / 700.0)

def mel_to_freq(mels):
    return 700.0 * (10.0**(mels / 2595.0) - 1.0)

def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    return audio
def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
    # hop_size in ms
    
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num,FFT_size))
    
    for n in range(frame_num):
        frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
    
    return frames
def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    fmin_mel = freq_to_mel(fmin)
    fmax_mel = freq_to_mel(fmax)
    
    print("MEL min: {0}".format(fmin_mel))
    print("MEL max: {0}".format(fmax_mel))
    
    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num+2)
    freqs = mel_to_freq(mels)
    
    return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs
def get_filters(filter_points, FFT_size):
    filters = np.zeros((len(filter_points)-2,int(FFT_size/2+1)))
    
    for n in range(len(filter_points)-2):
        filters[n, filter_points[n] : filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
        filters[n, filter_points[n + 1] : filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
    
    return filters
def dct(dct_filter_num, filter_len):
    basis = np.empty((dct_filter_num,filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)
    
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
        
    return basis
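A quick sanity check of the helpers (assuming numpy is imported as np, as in the main program below): 1000 Hz should map to roughly 1000 mel by construction of the scale, the two conversions should invert each other, and dct() should return one basis row per filter.

print(freq_to_mel(1000.0))                # ~1000 mel
print(mel_to_freq(freq_to_mel(440.0)))    # ~440.0 Hz: the conversions are inverses
print(dct(12, 10).shape)                  # (12, 10): 12 DCT filters over 10 mel bands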

Main Program

import os
import numpy as np
import scipy
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as ipd
import matplotlib.pyplot as plt

%matplotlib inline

TRAIN_PATH = '../input/audio_train/'
ipd.Audio(TRAIN_PATH + "a439d172.wav")

sample_rate, audio = wavfile.read(TRAIN_PATH + "a439d172.wav")
print("Sample rate: {0}Hz".format(sample_rate))
print("Audio duration: {0}s".format(len(audio) / sample_rate))

audio = normalize_audio(audio)
plt.figure(figsize=(15,4))
plt.plot(np.linspace(0, len(audio) / sample_rate, num=len(audio)), audio)
plt.grid(True)

hop_size = 15 #ms
FFT_size = 2048

audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
print("Framed audio shape: {0}".format(audio_framed.shape))

print("First frame:")
audio_framed[0]

print("Last frame:")
audio_framed[-1]

window = get_window("hann", FFT_size, fftbins=True)  # Hann window reduces spectral leakage at frame edges
plt.figure(figsize=(15,4))
plt.plot(window)
plt.grid(True)

audio_win = audio_framed * window  # apply the window to every frame (broadcasts over rows)

ind = 69
plt.figure(figsize=(15,6))
plt.subplot(2, 1, 1)
plt.plot(audio_framed[ind])
plt.title('Original Frame')
plt.grid(True)
plt.subplot(2, 1, 2)
plt.plot(audio_win[ind])
plt.title('Frame After Windowing')
plt.grid(True)

audio_winT = np.transpose(audio_win)

# FFT of each windowed frame, keeping only the positive-frequency bins
audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')
for n in range(audio_fft.shape[1]):
    audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]

audio_fft = np.transpose(audio_fft)

audio_power = np.square(np.abs(audio_fft))  # power spectrum of each frame
print(audio_power.shape)

freq_min = 0
freq_high = sample_rate / 2
mel_filter_num = 10

print("Minimum frequency: {0}".format(freq_min))
print("Maximum frequency: {0}".format(freq_high))

filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=sample_rate)
filter_points

filters = get_filters(filter_points, FFT_size)

plt.figure(figsize=(15,4))
for n in range(filters.shape[0]):
    plt.plot(filters[n])

# taken from the librosa library
enorm = 2.0 / (mel_freqs[2:mel_filter_num+2] - mel_freqs[:mel_filter_num])
filters *= enorm[:, np.newaxis]

plt.figure(figsize=(15,4))
for n in range(filters.shape[0]):
    plt.plot(filters[n])

audio_filtered = np.dot(filters, np.transpose(audio_power))  # apply the mel filter bank
audio_log = 10.0 * np.log10(audio_filtered)  # compress dynamic range to decibels
audio_log.shape

dct_filter_num = 40

dct_filters = dct(dct_filter_num, mel_filter_num)

cepstral_coefficients = np.dot(dct_filters, audio_log)  # DCT decorrelates the log filter-bank energies
cepstral_coefficients.shape

cepstral_coefficients[:, 0]

plt.figure(figsize=(15,5))
plt.imshow(cepstral_coefficients, aspect='auto', origin='lower');