Keras CNN Sound Classify #5

Keras CNN で音の分類 #5。update 2020.5.25

「ディープラーニングで音声分類」のサンプルが出ていたので、試してみました、の #5 です。
変更
1) 学習データのメルスペクトログラムの作成方法を変更しました。 by nishi 2020.5.25
Keras CNN Sound Classify
で、オリジナルができたので、同じ要領で、今度は、
Inception V3 を使って、同じ学習データ、テストデータを使って試してみました。

開発環境
OS: Windows10
GPU: GeForce GTX 1070 8GB
Anaconda
Python3.6
TensorFlow 1.14.0 GPU
Keras 2.3.1

注1) TensorFlow 1.14.0 を使っているのは、 K210 の YOLO v2 等で、
使っているので、他意はありません。
注2) Keras は、オリジナルを使っています。
tensorflow.python.keras を敢えて使っていません。

手順は、上記ページに従って行いました。

1.今回のテストと学習用のクラス、スペクトラムデータを、npz で作成。
プログラムは下記になります。
今回は、メルスペクトログラムのパラメータを変えて、データを、 (128,1723,1) → (256,862,1) にしてみました。
後、使う学習データは、 raw,wn,ss の3種類で試してみました。

load-dataset4.py
# -*- coding:utf-8 -*-
'''
sound classify
load-dataset4.py
https://qiita.com/cvusk/items/61cdbce80785eaf28349

update 2020.5.17 by nishi

Splits the ESC-50 clips into train/test sets, converts every clip to a
normalized mel spectrogram and stores the result as .npz files, optionally
applying a waveform-level or spectrogram-level augmentation.
'''
import os
import random

import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import model_selection
from sklearn import preprocessing
import IPython.display as ipd

np.random.seed(0)

# define directories
#base_dir = "./"
base_dir = "/tmp/ESC-50"
esc_dir = os.path.join(base_dir, "ESC-50-master")
meta_file = os.path.join(esc_dir, "meta/esc50.csv")
audio_dir = os.path.join(esc_dir, "audio/")

# load metadata
meta_data = pd.read_csv(meta_file)

# get data size
data_size = meta_data.shape
print(data_size)

# arrange target label and its name
class_dict = {}
for i in range(data_size[0]):
    if meta_data.loc[i, "target"] not in class_dict:
        class_dict[meta_data.loc[i, "target"]] = meta_data.loc[i, "category"]


#----------
def load_wave_data(audio_dir, file_name):
    """Load one wave file resampled to 44.1 kHz; return (samples, rate)."""
    file_path = os.path.join(audio_dir, file_name)
    x, fs = librosa.load(file_path, sr=44100)
    return x, fs


def calculate_melsp_org(x, n_fft=1024, hop_length=128):
    """Original (reference article) wave -> mel conversion, kept for comparison."""
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
    log_stft = librosa.power_to_db(stft)
    melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=128)
    return melsp


# change by nishi 2020.4.2
def calculate_melsp(x, n_fft=1024, hop_length=128, n_mels=128):
    """Convert a waveform to a mel spectrogram scaled by its own maximum.

    Args:
        x: 1-D waveform.
        n_fft: FFT window size passed to librosa.stft.
        hop_length: hop size passed to librosa.stft.
        n_mels: number of mel bands.

    Returns:
        2-D array (n_mels, frames) divided by its maximum value.
    """
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
    # the reference article used librosa.power_to_db(stft) (default ref)
    log_stft = librosa.power_to_db(stft, ref=0.0)
    melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=n_mels)
    #melsp -= melsp.min()
    peak = melsp.max()
    # guard: an all-zero spectrogram would otherwise become all-NaN
    if peak > 0:
        melsp /= peak
    return melsp


def show_wave(x):
    """Plot the raw waveform."""
    plt.plot(x)
    plt.show()


def show_melsp(melsp, fs, hop_length=128):
    """Show a mel spectrogram as a heat map.

    hop_length must match the value used to compute melsp so that the time
    axis is labelled correctly (default keeps the previous fixed value).
    """
    plt.figure(figsize=(14, 5))
    librosa.display.specshow(melsp, sr=fs, x_axis="time", y_axis="mel",
                             hop_length=hop_length)
    plt.colorbar(format='%+2.0f dB')
    plt.show()


# example data
x, fs = load_wave_data(audio_dir, meta_data.loc[0, "filename"])
melsp = calculate_melsp(x)
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x.shape, melsp.shape, fs))
show_wave(x)
show_melsp(melsp, fs)


#------------
def add_white_noise(x, rate=0.002):
    """Waveform augmentation: add Gaussian noise scaled by rate."""
    return x + rate*np.random.randn(len(x))


def shift_sound(x, rate=2):
    """Waveform augmentation: rotate the samples by len(x)//rate."""
    return np.roll(x, int(len(x)//rate))


def stretch_sound(x, rate=1.1):
    """Waveform augmentation: time-stretch, then crop/zero-pad to the input length."""
    input_length = len(x)
    # rate is passed by keyword: it is keyword-only in newer librosa
    # releases and also accepted by keyword in older ones.
    x = librosa.effects.time_stretch(x, rate=rate)
    if len(x) > input_length:
        return x[:input_length]
    return np.pad(x, (0, max(0, input_length - len(x))), "constant")


# add by nishi
# NOTE: all mask/slide helpers below modify melsp IN PLACE and return it.
def freq_mask(melsp):
    """Overwrite a random band of 3-8 frequency rows with the minimum value."""
    freq, time_s = melsp.shape
    w = random.randint(3, 8)
    i = random.randint(0, freq-w)
    melsp[i:i+w] = melsp.min()
    return melsp


def time_mask(melsp):
    """Overwrite a random span of 50-120 time columns with the minimum value."""
    freq, time_s = melsp.shape
    w = random.randint(50, 120)
    i = random.randint(0, time_s-w)
    melsp[:, i:i+w] = melsp.min()
    return melsp


def freq_time_mask(melsp):
    """Apply one frequency mask followed by one time mask."""
    return time_mask(freq_mask(melsp))


def freq_slide(melsp):
    """Slide the rows from a random row i upward or downward by w rows.

    The vacated w rows are filled with the minimum value; the w rows pushed
    out of range are dropped.
    """
    freq, time_s = melsp.shape
    direct = random.randint(-1, 1)     # slide direction (>= 0: up)
    w = random.randint(3, 6)
    i = random.randint(0, freq-w*3)
    m = melsp.min()
    if direct >= 0:
        # up slide
        # .copy() is required: a plain slice is a view, so filling
        # melsp[i:i+w] first would corrupt the rows about to be moved.
        mel_slid = melsp[i:freq-w].copy()
        melsp[i:i+w] = m
        melsp[i+w:] = mel_slid
    else:
        # down slide
        mel_slid = melsp[i+w:freq].copy()
        melsp[freq-w:freq] = m
        melsp[i:freq-w] = mel_slid
    return melsp


def freq_mask2(melsp):
    """Apply freq_mask twice."""
    return freq_mask(freq_mask(melsp))


def time_mask2(melsp):
    """Apply time_mask twice."""
    return time_mask(time_mask(melsp))


def freq_time_mask2(melsp):
    """Apply two frequency masks followed by two time masks."""
    return time_mask2(freq_mask2(melsp))


def freq_slide2(melsp):
    """Apply freq_slide twice."""
    return freq_slide(freq_slide(melsp))


def freq_slide_time_mask(melsp):
    """Apply freq_slide followed by time_mask."""
    return time_mask(freq_slide(melsp))


#--------------
def calculate_melsp_not_use(x, n_fft=1024, hop_length=128):
    """Unused duplicate of calculate_melsp_org, kept for reference."""
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
    log_stft = librosa.power_to_db(stft)
    melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=128)
    return melsp


#-----------
# get training dataset and target dataset
x = list(meta_data.loc[:, "filename"])
y = list(meta_data.loc[:, "target"])

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.25, stratify=y)
print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(
    len(x_train), len(y_train), len(x_test), len(y_test)))
# expected output:
#   x train:1500
#   y train:1500
#   x test:500
#   y test:500

# showing the classes are equally splitted
a = np.zeros(50)
for c in y_test:
    a[c] += 1
print(a)
# expected output: fifty times "10."

#-----------
# shape of one stored mel spectrogram: (freq, time)
#freq = 128
freq = 256
#time = 1723
time = 862
# hop length that produces `time` frames for one clip at 44.1 kHz
melsp_hop_length = 256


def save_np_data(filename, x, y, aug=None, aug2=None, rates=None):
    """Convert clips to mel spectrograms and save them with labels as .npz.

    Args:
        filename: output .npz path (arrays are stored under 'x' and 'y').
        x: list of wave file names relative to audio_dir.
        y: list of integer class targets, same length as x.
        aug: optional waveform augmentation called as aug(x=..., rate=...).
        aug2: optional spectrogram augmentation called as aug2(melsp).
        rates: per-clip rate values for aug (indexed like x); required
            when aug is given.
    """
    np_data = np.zeros((len(x), freq, time))
    np_targets = np.zeros(len(y))
    for i, (file_name, target) in enumerate(zip(x, y)):
        _x, fs = load_wave_data(audio_dir, file_name)
        if aug is not None:
            _x = aug(x=_x, rate=rates[i])
        _x = calculate_melsp(_x, hop_length=melsp_hop_length, n_mels=freq)
        if aug2 is not None:
            _x = aug2(_x)
        np_data[i] = _x
        np_targets[i] = target
    np.savez(filename, x=np_data, y=np_targets)


# save test dataset
if not os.path.exists("esc_melsp_test.npz"):
    print('0. esc_melsp_test.npz')
    save_np_data("esc_melsp_test.npz", x_test, y_test)

# save raw training dataset
if not os.path.exists("esc_melsp_train_raw.npz"):
    print('1. esc_melsp_train_raw.npz')
    save_np_data("esc_melsp_train_raw.npz", x_train, y_train)

# save training dataset with white noise
if True:
    if not os.path.exists("esc_melsp_train_wn.npz"):
        print('2. esc_melsp_train_wn.npz')
        rates = np.random.randint(1, 50, len(x_train))/10000
        save_np_data("esc_melsp_train_wn.npz", x_train, y_train,
                     aug=add_white_noise, rates=rates)

# save training dataset with sound shift
if not os.path.exists("esc_melsp_train_ss.npz"):
    print('3. esc_melsp_train_ss.npz')
    rates = np.random.choice(np.arange(2, 6), len(y_train))
    save_np_data("esc_melsp_train_ss.npz", x_train, y_train,
                 aug=shift_sound, rates=rates)

# save training dataset with stretch
if False:
    if not os.path.exists("esc_melsp_train_st.npz"):
        print('4. esc_melsp_train_st.npz')
        rates = np.random.choice(np.arange(80, 120), len(y_train))/100
        save_np_data("esc_melsp_train_st.npz", x_train, y_train,
                     aug=stretch_sound, rates=rates)

# save training dataset with combination of white noise and shift or stretch
if False:
    if not os.path.exists("esc_melsp_train_com.npz"):
        print('5. esc_melsp_train_com.npz')
        np_data = np.zeros((len(x_train), freq, time))
        np_targets = np.zeros(len(y_train))
        for i in range(len(y_train)):
            # local name _x: do not clobber the module-level file list `x`
            _x, fs = load_wave_data(audio_dir, x_train[i])
            _x = add_white_noise(x=_x, rate=np.random.randint(1, 50)/1000)
            if np.random.choice((True, False)):
                _x = shift_sound(x=_x, rate=np.random.choice(np.arange(2, 6)))
            else:
                _x = stretch_sound(x=_x, rate=np.random.choice(np.arange(80, 120))/100)
            # must use the same mel parameters as save_np_data, otherwise
            # the result does not fit the (freq, time) buffer
            _x = calculate_melsp(_x, hop_length=melsp_hop_length, n_mels=freq)
            np_data[i] = _x
            np_targets[i] = y_train[i]
        np.savez("esc_melsp_train_com.npz", x=np_data, y=np_targets)

# save training dataset with freq mask
if False:
    if not os.path.exists("esc_melsp_train_fmk.npz"):
        print('6. esc_melsp_train_fmk.npz')
        save_np_data("esc_melsp_train_fmk.npz", x_train, y_train, aug2=freq_mask)

# save training dataset with time mask
if False:
    if not os.path.exists("esc_melsp_train_tmk.npz"):
        print('7. esc_melsp_train_tmk.npz')
        save_np_data("esc_melsp_train_tmk.npz", x_train, y_train, aug2=time_mask)

# save training dataset with freq and time mask
if False:
    if not os.path.exists("esc_melsp_train_ftmk.npz"):
        print('8. esc_melsp_train_ftmk.npz')
        save_np_data("esc_melsp_train_ftmk.npz", x_train, y_train, aug2=freq_time_mask)

# save training dataset with freq slide
if False:
    if not os.path.exists("esc_melsp_train_fsl.npz"):
        print('9. esc_melsp_train_fsl.npz')
        save_np_data("esc_melsp_train_fsl.npz", x_train, y_train, aug2=freq_slide)

# save training dataset with freq slide and time mask
if False:
    if not os.path.exists("esc_melsp_train_fsl_tmk.npz"):
        print('10. esc_melsp_train_fsl_tmk.npz')
        save_np_data("esc_melsp_train_fsl_tmk.npz", x_train, y_train,
                     aug2=freq_slide_time_mask)

2. 学習の実行
プログラムは、下記になります。

今回は、float32 を使います。
GPUを GTX1070 8GByte にしたので学習できます。

今回は、
val_acc/loss=0.8100/0.9055 acc/loss=0.9740/0.5329 280 epoch です。

train-incept.py
'''
sound classify
train-incept.py
from keras.applications import MobileNetV2
https://qiita.com/cvusk/items/61cdbce80785eaf28349

Trains InceptionV3 from scratch on the mel-spectrogram .npz datasets,
cycling through the training files and saving the weights after each one.
'''
import os
import random
import numpy as np
#import pandas as pd

#import tensorflow as tf
#import tensorflow.keras.backend as K
import keras
#from keras.applications import VGG16
from keras.applications.inception_v3 import InceptionV3
#from keras.applications.xception import Xception
#from keras.applications.resnet50 import ResNet50
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers import Conv2D, GlobalAveragePooling2D
from keras.layers import BatchNormalization, Add
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

import matplotlib.pyplot as plt

np.random.seed(0)

#test_name='test4'
#test_name='test4f'
#test_name='test4g2'
test_name = 'test4h'

#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
#K.set_session(tf.Session(config=config))

# https://www.haya-programming.com/entry/2018/12/28/004211

# mel-spectrogram augmentation datasets shared by several experiments
_mask_slide_npz = [
    'esc_melsp_train_fmk.npz',
    'esc_melsp_train_tmk.npz',
    'esc_melsp_train_ftmk.npz',
    'esc_melsp_train_fsl.npz',
    'esc_melsp_train_fsl_tmk.npz',
]

# Per-experiment settings. npz_list[0] is always the test set; the
# remaining entries are training sets used one after another.
experiments = {
    'test4': {
        'npz_list': ['esc_melsp_test.npz',
                     'esc_melsp_train_raw.npz',
                     'esc_melsp_train_ss.npz'] + _mask_slide_npz,
        'log_dir': 'logs/vgg16-test4_f16/',
        'weights_load_path': 'sound_trained_weights-vgg16-test3_f16-1_3.h5',
        'weights_sav_path': 'sound_trained_weights-vgg16-test3_f16',
    },
    'test4f': {
        'npz_list': ['esc_melsp_test.npz',
                     'esc_melsp_train_raw.npz',
                     'esc_melsp_train_ss.npz'] + _mask_slide_npz,
        'log_dir': 'logs/inceptV3-test4f_f16/',
        'weights_load_path': 'sound_trained_weights-inceptv3-test4f_f16-12_7.h5',
        'weights_sav_path': 'sound_trained_weights-inceptv3-test4f_f16',
    },
    'test4g2': {
        'npz_list': ['esc_melsp_test.npz',
                     'esc_melsp_train_raw.npz',
                     'esc_melsp_train_wn.npz',
                     'esc_melsp_train_ss.npz'] + _mask_slide_npz,
        'log_dir': 'logs/inceptV3-test4g2_f16/',
        'weights_load_path': 'sound_trained_weights-inceptv3-test4g2_f16-12_8.h5',
        'weights_sav_path': 'sound_trained_weights-inceptv3-test4g2_f16',
    },
    'test4h': {
        'npz_list': ['esc_melsp_test.npz',
                     'esc_melsp_train_raw.npz',
                     'esc_melsp_train_wn.npz',
                     'esc_melsp_train_ss.npz'],
        'log_dir': 'logs/inceptV3-test4h2_f32/',
        'weights_load_path': 'sound_trained_weights-inceptv3-test4h2_f32-1_3.h5',
        'weights_sav_path': 'sound_trained_weights-inceptv3-test4h2_f32',
    },
}

_cfg = experiments[test_name]
npz_list = _cfg['npz_list']
log_dir = _cfg['log_dir']
weights_load_path = _cfg['weights_load_path']
weights_sav_path = _cfg['weights_sav_path']

classes = 50
test_id = 4
dropout = 0.5


class MixupGenerator():
    """Endless batch generator that blends pairs of samples (mixup).

    Each yielded batch is a convex combination of two disjoint halves of
    2*batch_size shuffled samples, with per-sample weights drawn from
    Beta(alpha, alpha). Calling the instance returns the generator.
    """

    def __init__(self, X_train, y_train, batch_size=32, alpha=0.2,
                 shuffle=True, datagen=None):
        self.X_train = X_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.alpha = alpha
        self.shuffle = shuffle
        self.sample_num = len(X_train)
        self.datagen = datagen

    def __call__(self):
        while True:
            indexes = self.__get_exploration_order()
            # every batch consumes 2*batch_size samples (two halves to mix)
            itr_num = int(len(indexes) // (self.batch_size * 2))

            for i in range(itr_num):
                batch_ids = indexes[i * self.batch_size * 2:(i + 1) * self.batch_size * 2]
                X, y = self.__data_generation(batch_ids)

                yield X, y

    def __get_exploration_order(self):
        indexes = np.arange(self.sample_num)

        if self.shuffle:
            np.random.shuffle(indexes)

        return indexes

    def __data_generation(self, batch_ids):
        first_half = batch_ids[:self.batch_size]
        second_half = batch_ids[self.batch_size:]
        X1 = self.X_train[first_half]
        X2 = self.X_train[second_half]
        y1 = self.y_train[first_half]
        y2 = self.y_train[second_half]

        lam = np.random.beta(self.alpha, self.alpha, self.batch_size)
        # reshape so the weights broadcast over (batch, h, w, c) / (batch, classes)
        X_l = lam.reshape(self.batch_size, 1, 1, 1)
        y_l = lam.reshape(self.batch_size, 1)

        X = X1 * X_l + X2 * (1 - X_l)
        y = y1 * y_l + y2 * (1 - y_l)

        if self.datagen:
            for i in range(self.batch_size):
                X[i] = self.datagen.random_transform(X[i])

        return X, y


def load_npz(path, classes=50, test=4):
    """Load one dataset file and return (x, y) ready for Keras.

    Args:
        path: .npz file containing arrays 'x' and 'y'.
        classes: number of classes for the one-hot encoding of y.
        test: normalization variant (2: shift and divide by 6.5,
            3: divide by 80, 4: data is used as stored).

    Returns:
        x as float32 with a trailing channel axis of size 1, and y as a
        one-hot matrix.
    """
    # context manager closes the underlying zip file once both arrays are read
    with np.load(path) as npz_dt:
        # data type convert float64 -> float32
        x = npz_dt['x'].astype('float32')
        y = npz_dt['y']

    # reshape, e.g. (-1,128,1723) -> (-1,128,1723,1)
    x = np.reshape(x, (-1, x.shape[1], x.shape[2], 1))

    # normalize
    if test == 2:
        x_max = x.max()
        x_min = x.min()
        # NOTE(review): the shift is assumed to apply only to negative
        # minima; the nesting was ambiguous in the published listing.
        if x_min < 0.0:
            x_min *= -1.0
            x += x_min
            x_max += x_min
        #x /= x_max
        x /= 6.5
        print('>x_max=', x_max)
    elif test == 3:
        x /= 80.0
        print('>x_max=', x.max())
        print('>x_min=', x.min())
    elif test == 4:
        print('>x_max=', x.max())
        print('>x_min=', x.min())

    # redefine target data into one hot vector
    y = keras.utils.to_categorical(y, classes)
    return x, y


# load test data
x_test, y_test = load_npz(npz_list[0], classes=classes, test=test_id)

# define CNN
print('>x_test.shape=', x_test.shape)
print('>x_test.dtype=', x_test.dtype)

# input shape is taken from the data, e.g. (128,1723,1) or (256,862,1)
model = InceptionV3(include_top=True, weights=None, input_tensor=None,
                    input_shape=(x_test.shape[1:]), pooling=None, classes=classes)

#logging = TensorBoard(log_dir=log_dir,write_graph=True)
logging = TensorBoard(log_dir=log_dir)

s_cnt = 1
e_cnt = s_cnt+1
epochs = 4
# first training file of the first round; raise it to resume a round part-way
first_npz_id = 1
next_epochs = epochs*3*(s_cnt - 1)+epochs*0

batch_size = 6
#batch_size=4

# run switches
resume_from_weights = False    # load weights_load_path before training
use_mixup = True               # train through MixupGenerator instead of model.fit
show_history = False           # plot loss/accuracy after every training file

if resume_from_weights:
    model.load_weights(weights_load_path, by_name=True)

# initiate Adam optimizer; the learning rate is lowered for late rounds
if s_cnt >= 1000:
    opt = Adam(lr=0.0000025)
elif s_cnt >= 700:
    opt = Adam(lr=0.000005)
else:
    opt = Adam(lr=1e-5)    # 0.00001

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

#model.summary()

for cnt in range(s_cnt, e_cnt):
    for npz_id in range(first_npz_id, len(npz_list)):
        print('npz=', npz_list[npz_id])
        # load train data
        x_train, y_train = load_npz(npz_list[npz_id], classes=classes, test=test_id)

        if use_mixup:
            training_generator = MixupGenerator(x_train, y_train, batch_size=batch_size)()
            hist = model.fit_generator(generator=training_generator,
                                       steps_per_epoch=x_train.shape[0] // batch_size,
                                       validation_data=(x_test, y_test),
                                       epochs=next_epochs+epochs,
                                       initial_epoch=next_epochs,
                                       callbacks=[logging])
        else:
            hist = model.fit(x_train, y_train,
                             epochs=next_epochs+epochs,
                             initial_epoch=next_epochs,
                             batch_size=batch_size,
                             validation_data=(x_test, y_test),
                             callbacks=[logging])

        sav_p = "%s-%d_%d.h5" % (weights_sav_path, cnt, npz_id)
        print('sav_p=', sav_p)
        model.save_weights(sav_p)

        #-------------
        if show_history:
            hist_dic = hist.history
            # keys: 'val_loss', 'val_accuracy', 'loss', 'accuracy'
            acc = hist_dic['accuracy']
            val_acc = hist_dic['val_accuracy']
            loss = hist_dic['loss']
            val_loss = hist_dic['val_loss']

            # separate name: `epochs` must stay an int for the counter below
            epoch_axis = range(1, len(acc) + 1)

            # "bo" is for "blue dot"
            plt.plot(epoch_axis, loss, 'bo', label='Training loss')
            # b is for "solid blue line"
            plt.plot(epoch_axis, val_loss, 'b', label='Validation loss')
            plt.title('Training and validation loss')
            plt.xlabel('Epochs')
            plt.ylabel('Loss')
            plt.legend()
            plt.show()

            plt.clf()
            plt.plot(epoch_axis, acc, 'bo', label='Training acc')
            # b is for "solid blue line"
            plt.plot(epoch_axis, val_acc, 'b', label='Validation acc')
            plt.title('Training and validation accuracy')
            plt.xlabel('Epochs')
            plt.ylabel('Accuracy')
            plt.legend()
            plt.show()

        # drop the references so the arrays can be freed before the next load
        x_train = None
        y_train = None
        next_epochs += epochs
    # later rounds always start from the first training file
    first_npz_id = 1

3. モデルのアプリ
学習済みモデルを使った、PCのマイク入力による Sound Classify プログラムは、下記になります。

PCのスピーカから、ESC-50　のSound の音を出して、下記プログラムにPCのマイク経由で音を与えれば、クラスの分類が出来ます。

結構当たりますが、飛行機、ヘリコプター、エンジンなどの、細かい音が一杯入っている音は苦手な、ような気がします。

注1) 別のテストで、ESC-50 のデータを読み込んで直接判定させてみると、飛行機、ヘリコプター、エンジンの音だけでの検証で、
110/120 → 0.916667 になるので、どうやら、スピーカ → マイク間で音が悪くなっているのが原因のようです。

注2) 今回は、学習データにwn(white noise) を入れたので、ある程度は、PCのノイズには強いようですが、やはり、マイクがPCのファンの音を拾わないようにした方が良いと思います。

注3) 下記のソースで、dbl_off = 2 を設定すると、入力データを、0.4 sec ずらしてダブルでのmodel.predict() を行えます。

sound-predict3-incept-v3.py
# -*- coding:utf-8 -*-
'''
Created on 2020/01/19
update on 2020/05/26

sound-predict3-incept-v3.py

https://qiita.com/__Attsun__/items/e033d689c336315435b3
https://musicinformationretrieval.com/ipython_audio.html

Listens to the microphone and, once the input level exceeds a threshold,
records about five seconds and classifies them with a trained InceptionV3.

@author: nishi
'''
import os

import keras
from keras.models import Model
#from keras.applications import MobileNetV2
from keras.applications.inception_v3 import InceptionV3
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers import Conv2D, GlobalAveragePooling2D
from keras.layers import BatchNormalization, Add
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
#import matplotlib as mpl
import pyaudio

np.random.seed(0)

#test_name='test4f'
test_name = 'test4h'

test_id = 4

# ESC-50 target id -> category name
class_indx = {
    0: 'dog',
    1: 'rooster',
    2: 'pig',
    3: 'cow',
    4: 'frog',
    5: 'cat',
    6: 'hen',
    7: 'insects',
    8: 'sheep',
    9: 'crow',
    10: 'rain',
    11: 'sea_waves',
    12: 'crackling_fire',
    13: 'crickets',
    14: 'chirping_birds',
    15: 'water_drops',
    16: 'wind',
    17: 'pouring_water',
    18: 'toilet_flush',
    19: 'thunderstorm',
    20: 'crying_baby',
    21: 'sneezing',
    22: 'clapping',
    23: 'breathing',
    24: 'coughing',
    25: 'footsteps',
    26: 'laughing',
    27: 'brushing_teeth',
    28: 'snoring',
    29: 'drinking_sipping',
    30: 'door_wood_knock',
    31: 'mouse_click',
    32: 'keyboard_typing',
    33: 'door_wood_creaks',
    34: 'can_opening',
    35: 'washing_machine',
    36: 'vacuum_cleaner',
    37: 'clock_alarm',
    38: 'clock_tick',
    39: 'glass_breaking',
    40: 'helicopter',
    41: 'chainsaw',
    42: 'siren',
    43: 'car_horn',
    44: 'engine',
    45: 'train',
    46: 'church_bells',
    47: 'airplane',
    48: 'fireworks',
    49: 'hand_saw',
}

#f_list=['1-137-A-32.wav','1-13572-A-46.wav','1-50455-A-44.wav']


def load_wave_data(audio_dir, file_name):
    """Load one wave file resampled to 44.1 kHz; return (samples, rate)."""
    file_path = os.path.join(audio_dir, file_name)
    x, fs = librosa.load(file_path, sr=44100)
    return x, fs


def calculate_melsp_org(x, n_fft=1024, hop_length=128):
    """Original (reference article) wave -> mel conversion, kept for comparison."""
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
    log_stft = librosa.power_to_db(stft)
    melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=128)
    return melsp


# for load-dataset.py, modified 2020.4.2
def calculate_melsp1(x, n_fft=1024, hop_length=128, n_mels=128):
    """Convert a waveform to a mel spectrogram scaled by its own maximum.

    Returns a 2-D array (n_mels, frames) divided by its maximum value.
    """
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2
    # the reference article used librosa.power_to_db(stft) (default ref)
    log_stft = librosa.power_to_db(stft, ref=0.0)
    melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=n_mels)
    #melsp -= melsp.min()
    peak = melsp.max()
    # guard: an all-zero spectrogram would otherwise become all-NaN
    if peak > 0:
        melsp /= peak
    return melsp


# for load-dataset2.py
def calculate_melsp2(x, n_fft=1024, hop_length=128, sr=44100):
    """Mel spectrogram in dB, shifted so that its minimum is 0."""
    S = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=128, n_fft=n_fft,
                                       hop_length=hop_length)
    db_D = librosa.amplitude_to_db(S, ref=0.0)
    db_D -= db_D.min()
    return db_D


def load_melsp(dx, test=3):
    """Turn one 2-D mel spectrogram into a model input batch of size 1.

    Args:
        dx: 2-D array (mel bands, frames).
        test: normalization variant (2: shift and divide by 6.5,
            3: divide by 80, 4: data is used as given).

    Returns:
        4-D array (1, mel bands, frames, 1).
    """
    dx = dx[np.newaxis, :, :]
    dx_shape = dx.shape
    # reshape, e.g. (-1,128,1723) -> (-1,128,1723,1)
    dx = np.reshape(dx, (-1, dx_shape[1], dx_shape[2], 1))
    print(dx.shape)

    # normalize
    if test == 2:
        dx_max = dx.max()
        dx_min = dx.min()
        print('>dx_max=', dx_max, ', dx_min=', dx_min)
        # NOTE(review): the shift is assumed to apply only to negative
        # minima; the nesting was ambiguous in the published listing.
        if dx_min < 0.0:
            dx_min *= -1.0
            dx += dx_min
            dx_max += dx_min
        #dx /= dx_max
        dx /= 6.5
        print('>dx_max=', dx_max)
    elif test == 3:
        dx /= 80.0
        #dx /= 4.1
        print('>dx_max=', dx.max())
        print('>dx_min=', dx.min())
    elif test == 4:
        print('>dx_max=', dx.max())
        print('>dx_min=', dx.min())
    return dx


def print_prediction(v):
    """Print the best class id, its name and its score for one model output."""
    i = v.argmax()
    pred = v.max()
    s = class_indx[i]
    print('>', i, ':', s, ' ', pred)


classes = 50

#input_shape=(128,1379,1)
input_shape = (128, 1723, 1)
if test_name == 'test4h':
    input_shape = (256, 862, 1)

# number of time frames the model accepts
dx_mx = input_shape[1]

model = InceptionV3(include_top=True, weights=None, input_tensor=None,
                    input_shape=input_shape, pooling=None, classes=classes)

if test_name == 'test4f':
    weights_load_path = 'save/sound_trained_weights-inceptv3-test4f_f16-12_5.h5'
elif test_name == 'test4h':
    #weights_load_path = 'save/sound_trained_weights-inceptv3-test4h_f32-19_1.h5'
    #weights_load_path = 'save/sound_trained_weights-inceptv3-test4h2_f32-24_1.h5'
    weights_load_path = 'save/sound_trained_weights-inceptv3-test4h3_f32-9_3.h5'
    #weights_load_path = 'save/sound_trained_weights-inceptv3-test4h4_f32-11_1.h5'
else:
    # fail early with a clear message instead of a NameError below
    raise ValueError('no trained weights configured for test_name=%r' % (test_name,))

print('load', weights_load_path)
model.load_weights(weights_load_path, by_name=True)

RATE = 44100              # Sampling Rate
CHUNK = int(RATE/5)       # 0.2 [sec] CHUNK
predict_dt = 5*5          # predict data length -> 5 [sec]
#predict_dt = 5*4         # predict data length -> 4 [sec]
dbl_off = 2               # double check offset (in chunks); 0 disables it

#level_th=1000.0   # predict start Sound Level
#level_th=2000.0   # predict start Sound Level
#level_th=1500.0   # predict start Sound Level
level_th = 2300.0  # predict start Sound Level

p = pyaudio.PyAudio()

stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=RATE,
                frames_per_buffer=CHUNK,
                input=True,
                output=False)   # set input and output both to True to echo the input

smaple_f = 0       # 0: waiting for a loud chunk, 1: recording

hop_length = 128
n_mels = 128
if test_name == 'test4h':
    hop_length = 256
    n_mels = 256

# try/finally: the loop normally ends by Ctrl-C or an audio error, and the
# device must be released in that case too.
try:
    while stream.is_active():
        chunk = stream.read(CHUNK)
        #output = stream.write(chunk)
        if smaple_f == 0:
            dx = np.frombuffer(chunk, dtype='int16').astype('float32')
            level_max = dx.max()
            if level_max < level_th:
                continue

            print('level=', level_max)
            smaple_f = 1
            data = []
            dcnt = 0

        data.append(chunk)
        dcnt += 1
        if dcnt >= predict_dt+dbl_off:
            print('.', end='')

            data = b''.join(data)
            # scale int16 samples to [-1, 1)
            if test_name == 'test4h':
                dx = np.frombuffer(data, dtype="int16").astype('float32')/2**15
            else:
                dx = np.frombuffer(data, dtype="int16").astype('float16')/2**15

            melsp = calculate_melsp1(dx, hop_length=hop_length, n_mels=n_mels)
            dt = load_melsp(melsp, test=test_id)

            if dbl_off > 0:
                # The recording is dbl_off chunks longer than the model
                # input: predict on the first and on the last dx_mx frames.
                dx_st = dt.shape[2]-dx_mx
                v = model.predict(dt[:, :, :dx_mx, :])
                v2 = model.predict(dt[:, :, dx_st:, :])
                print_prediction(v)
                print_prediction(v2)
            else:
                v = model.predict(dt)
                print_prediction(v)

            if False:
                #plt.figure(figsize=(14, 5))
                librosa.display.waveplot(dx, sr=RATE)
                plt.show()

            if False:
                X = librosa.stft(dx)
                Xdb = librosa.amplitude_to_db(abs(X))
                #plt.figure(figsize=(14, 5))
                librosa.display.specshow(Xdb, sr=RATE, x_axis='time', y_axis='hz')
                plt.show()

            if False:
                plt.figure(figsize=(14, 5))
                # use the hop length the spectrogram was computed with
                librosa.display.specshow(melsp, sr=RATE, x_axis="time", y_axis="mel",
                                         hop_length=hop_length)
                plt.colorbar(format='%+2.0f dB')
                plt.title("sound-predict3")
                plt.show()

            smaple_f = 0
            data = None
            # restart the stream after the prediction
            stream.stop_stream()
            stream.start_stream()
finally:
    stream.stop_stream()
    stream.close()
    p.terminate()

注) ちなみに、PCのスピーカから、ESC-50　のSound の音を出すのは、
sound-player.py を使います。
Keras CNN Sound Classify #2

感想としては、InceptionV3 の方が、良い結果が得られるみたいです。

カテゴリ:

検索

このブログ記事について

カテゴリ

月別アーカイブ

ウェブページ

サイトナビ