SPEECH PROCESSING
Understanding audio data, the Fourier transform, the FFT, and spectrogram features for a speech recognition system
An introduction to audio data analysis (sound analysis) using Python
Speech processing is the study of speech signals and of methods for processing them. The signals are usually handled in a digital representation, so speech processing can be regarded as a special case of digital signal processing applied to speech signals. Aspects of speech processing include the acquisition, manipulation, storage, transfer and output of speech signals. Recognizing spoken input is called speech recognition, and producing spoken output is called speech synthesis.
READ AUDIO FILE
import wave

chunk = 1024    # number of frames to read at a time
f = wave.open('/content/drive/My Drive/Colab Notebooks/colab_tutorials/file.wav', 'rb')
data = f.readframes(chunk)
f.close()
PLAY AUDIO FILE
import pyaudio
# define stream chunk
# open a wav format music
# play stream
# stop stream
# close PyAudio
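Filling in that outline, a minimal playback sketch, assuming a WAV file at file.wav (a hypothetical path):

import wave
import pyaudio

chunk = 1024                                     # define stream chunk
f = wave.open('file.wav', 'rb')                  # open a wav format music
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                channels=f.getnchannels(),
                rate=f.getframerate(),
                output=True)
data = f.readframes(chunk)
while data:                                      # play stream chunk by chunk
    stream.write(data)
    data = f.readframes(chunk)
stream.stop_stream()                             # stop stream
stream.close()
p.terminate()                                    # close PyAudio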
READ AND PLOT SPEECH DATA
import matplotlib.pyplot as plt
from scipy.io import wavfile   # get the api
import numpy as np

fs, data = wavfile.read('/content/drive/MyDrive/Colab Notebooks/colab_tutorials/OAF_base_happy.wav')
plt.plot(data, 'g')
plt.show()
FAST FOURIER TRANSFORM
from scipy.io import wavfile   # get the api
from scipy.fftpack import fft
import matplotlib.pyplot as plt
import numpy as np

fs, data = wavfile.read('file.wav')        # load the data
a = data.T[0]                              # two-channel soundtrack: take the first channel
b = [(ele / 2**8.) * 2 - 1 for ele in a]   # for an 8-bit track, b is normalized on [-1, 1)
c = fft(a)                                 # calculate the Fourier transform (array of complex numbers)
d = len(c) // 2                            # only half of the FFT list is needed (real-signal symmetry)
plt.plot(abs(c[:d - 1]), 'r')
plt.show()
SPECTROGRAM USING LIBROSA
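A minimal sketch of computing and displaying a spectrogram with librosa, assuming librosa is installed and a WAV file at file.wav (a hypothetical path):

import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display

y, sr = librosa.load('file.wav')                        # resamples to 22050 Hz by default
D = librosa.stft(y, n_fft=2048, hop_length=512)         # short-time Fourier transform
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)   # magnitude to decibels
librosa.display.specshow(S_db, sr=sr, hop_length=512, x_axis='time', y_axis='hz')
plt.colorbar(format='%+2.0f dB')
plt.title('Spectrogram')
plt.show()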
SINGLE-SIDED FFT
import scipy.io.wavfile as wavfile
import scipy.fftpack
import numpy as np
from matplotlib import pyplot as plt

fs_rate, signal = wavfile.read("/content/drive/My Drive/Colab Notebooks/colab_tutorials/file.wav")
print("Frequency sampling", fs_rate)
l_audio = len(signal.shape)
print("Channels", l_audio)
if l_audio == 2:
    signal = signal.sum(axis=1) / 2            # mix stereo down to mono
N = signal.shape[0]
print("Complete Samplings N", N)
secs = N / float(fs_rate)
print("secs", secs)
Ts = 1.0 / fs_rate                             # sampling interval in time
print("Timestep between samples Ts", Ts)
t = np.arange(0, secs, Ts)                     # time vector as a numpy.ndarray
FFT = abs(scipy.fftpack.fft(signal))
FFT_side = FFT[:N // 2]                        # one-sided FFT range
freqs = scipy.fftpack.fftfreq(signal.size, t[1] - t[0])
freqs_side = freqs[:N // 2]                    # one-sided frequency range

plt.subplot(311)
plt.plot(t, signal, "g")                       # plot the signal
plt.xlabel('Time')
plt.ylabel('Amplitude')
plt.subplot(312)
plt.plot(freqs, FFT, "r")                      # plot the complete (double-sided) FFT spectrum
plt.xlabel('Frequency (Hz)')
plt.ylabel('Count dbl-sided')
plt.subplot(313)
plt.plot(freqs_side, abs(FFT_side), "b")       # plot the positive (single-sided) FFT spectrum
plt.xlabel('Frequency (Hz)')
plt.ylabel('Count single-sided')
plt.show()
AUDIO PLAY USING PYGAME
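A minimal playback sketch with pygame, assuming pygame is installed and a WAV file at file.wav (a hypothetical path):

import time
import pygame

pygame.mixer.init()                      # initialise the mixer with default settings
pygame.mixer.music.load('file.wav')      # hypothetical path
pygame.mixer.music.play()
while pygame.mixer.music.get_busy():     # wait until playback finishes
    time.sleep(0.1)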
MFCC theory and implementation
Theory
Mel Frequency Cepstral Coefficients (MFCCs) are a way of extracting features from an audio signal. The MFCC pipeline uses the Mel scale to divide the frequency band into sub-bands and then extracts the cepstral coefficients using a Discrete Cosine Transform (DCT). The Mel scale models the way humans distinguish between frequencies, which makes it very convenient for processing sound.
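The two conversions, implemented by the helper functions below, are mel(f) = 2595 · log10(1 + f / 700) to go from Hz to Mel, and f(m) = 700 · (10^(m / 2595) − 1) to go back.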
def freq_to_mel(freq):
    return 2595.0 * np.log10(1.0 + freq / 700.0)

def mel_to_freq(mels):
    return 700.0 * (10.0**(mels / 2595.0) - 1.0)
def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))   # scale to the [-1, 1] range
    return audio

def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
    # hop_size in ms
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num, FFT_size))
    for n in range(frame_num):
        frames[n] = audio[n * frame_len:n * frame_len + FFT_size]
    return frames

def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    fmin_mel = freq_to_mel(fmin)
    fmax_mel = freq_to_mel(fmax)
    print("MEL min: {0}".format(fmin_mel))
    print("MEL max: {0}".format(fmax_mel))
    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2)   # equally spaced on the Mel scale
    freqs = mel_to_freq(mels)                                        # back to Hz
    return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs

def get_filters(filter_points, FFT_size):
    filters = np.zeros((len(filter_points) - 2, int(FFT_size / 2 + 1)))
    for n in range(len(filter_points) - 2):
        # each filter is a triangle: a rising edge followed by a falling edge
        filters[n, filter_points[n]:filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
        filters[n, filter_points[n + 1]:filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
    return filters

def dct(dct_filter_num, filter_len):
    # orthonormal type-II DCT basis used to decorrelate the log filterbank energies
    basis = np.empty((dct_filter_num, filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)
    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
    return basis
Main Program
import os
import numpy as np
import scipy
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as ipd
import matplotlib.pyplot as plt
%matplotlib inline
TRAIN_PATH = '../input/audio_train/'
ipd.Audio(TRAIN_PATH + "a439d172.wav")
sample_rate, audio = wavfile.read(TRAIN_PATH + "a439d172.wav")
print("Sample rate: {0}Hz".format(sample_rate))
print("Audio duration: {0}s".format(len(audio) / sample_rate))
audio = normalize_audio(audio)
plt.figure(figsize=(15, 4))
plt.plot(np.linspace(0, len(audio) / sample_rate, num=len(audio)), audio)
plt.grid(True)
hop_size = 15      # ms
FFT_size = 2048

audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
print("Framed audio shape: {0}".format(audio_framed.shape))
print("First frame:") audio_framed[1]
print("Last frame:") audio_framed[-1]
window = get_window("hann", FFT_size, fftbins=True)
plt.figure(figsize=(15, 4))
plt.plot(window)
plt.grid(True)
audio_win = audio_framed * window

ind = 69
plt.figure(figsize=(15, 6))
plt.subplot(2, 1, 1)
plt.plot(audio_framed[ind])
plt.title('Original Frame')
plt.grid(True)
plt.subplot(2, 1, 2)
plt.plot(audio_win[ind])
plt.title('Frame After Windowing')
plt.grid(True)
audio_winT = np.transpose(audio_win)
audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')
for n in range(audio_fft.shape[1]):
    audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]   # keep the non-negative frequencies
audio_fft = np.transpose(audio_fft)
audio_power = np.square(np.abs(audio_fft))
print(audio_power.shape)
freq_min = 0
freq_high = sample_rate / 2
mel_filter_num = 10

print("Minimum frequency: {0}".format(freq_min))
print("Maximum frequency: {0}".format(freq_high))
filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=sample_rate)
filter_points
filters = get_filters(filter_points, FFT_size)
plt.figure(figsize=(15, 4))
for n in range(filters.shape[0]):
    plt.plot(filters[n])
# area normalization, taken from the librosa library
enorm = 2.0 / (mel_freqs[2:mel_filter_num + 2] - mel_freqs[:mel_filter_num])
filters *= enorm[:, np.newaxis]
plt.figure(figsize=(15, 4))
for n in range(filters.shape[0]):
    plt.plot(filters[n])
audio_filtered = np.dot(filters, np.transpose(audio_power))
audio_log = 10.0 * np.log10(audio_filtered)
audio_log.shape
dct_filter_num = 40

dct_filters = dct(dct_filter_num, mel_filter_num)
cepstral_coefficients = np.dot(dct_filters, audio_log)
cepstral_coefficients.shape
cepstral_coefficients[:, 0]
plt.figure(figsize=(15, 5))
plt.imshow(cepstral_coefficients, aspect='auto', origin='lower')
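As a cross-check of the from-scratch implementation above, librosa ships a ready-made MFCC routine. A minimal sketch, assuming the same audio and sample rate as above; the exact values will differ because librosa uses different defaults (e.g. number of Mel filters and windowing):

import librosa

# audio must be floating point; n_mfcc mirrors dct_filter_num above
mfccs = librosa.feature.mfcc(y=audio.astype(float), sr=sample_rate, n_mfcc=40)
print(mfccs.shape)   # (40, number_of_frames)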