Keras ASR with CTC を試す。

Keras ASR with CTC を試してみた。
Automatic Speech Recognition using CTC

当初は、Automatic Speech Recognition with Transformer を試してみたけれど、
余り、loss、val_loss が改善しないので、こちらにしてみました。
注) 但し、こちらの方が、学習させるのは、軽いみたいなので、もう少し性能が良ければ...

環境:
Windows11
Python 3.10.6
tensorflow-gpu 2.10.0
GTX-1070
cuda toolkit 11.2
cuDNN SDK 8.1.0

注) GTX-1070 だと、1 epoch 20分程かかる。
GeForce RTX 2080 Ti GPU だと、5-6 分だとさ。

取り敢えず、自分で手直しして試してみました。
オリジナルのままだと、中断した学習を途中のエポックから再開 (処理の継続) することができません。

"""Automatic Speech Recognition using CTC.

Based on the Keras example: https://keras.io/examples/audio/ctc_asr/

Trains a DeepSpeech2-like model on LJSpeech with a CTC loss.  On top of the
upstream example this script persists the train/validation split and writes
weight checkpoints so that an interrupted run can be resumed by setting
``CONT_F = True`` in the main section.
"""
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython import display
from jiwer import wer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import (
    EarlyStopping,
    LearningRateScheduler,
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)

# Optional: let TensorFlow grow GPU memory on demand instead of grabbing it all.
# for gpu in tf.config.experimental.list_physical_devices("GPU"):
#     tf.config.experimental.set_memory_growth(gpu, True)

# STFT parameters -----------------------------------------------------------
# Window length in samples.
frame_length = 256
# Number of samples to step between consecutive windows.
frame_step = 160
# Size of the FFT to apply.  If not provided, tf.signal.stft uses the smallest
# power of 2 enclosing frame_length.
fft_length = 384


def encode_single_sample(wav_file, label):
    """Turn one (file name, transcription) pair into (spectrogram, label ids).

    Reads the module-level ``wavs_path`` and ``char_to_num``, which are
    created in the main section before the datasets are built.

    Args:
        wav_file: File name of the clip, without directory and without the
            ``.wav`` extension.
        label: Transcription text of the clip.

    Returns:
        A ``(spectrogram, label)`` tuple: the normalised magnitude
        spectrogram and the transcription mapped through ``char_to_num``.
    """
    ###########################################
    ##  Process the Audio
    ##########################################
    # 1. Read wav file
    file = tf.io.read_file(wavs_path + wav_file + ".wav")
    # 2. Decode the wav file and drop the trailing (channel) axis
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    # 3. Change type to float
    audio = tf.cast(audio, tf.float32)
    # 4. Get the spectrogram
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # 5. We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # 6. Normalisation along axis 1; the epsilon avoids division by zero
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    ###########################################
    ##  Process the label
    ##########################################
    # 7. Convert label to lower case
    label = tf.strings.lower(label)
    # 8. Split the label into characters
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    # 9. Map the characters in label to numbers
    label = char_to_num(label)
    # 10. Return the (input, target) pair consumed by model.fit
    return spectrogram, label


def CTCLoss(y_true, y_pred):
    """Compute the training-time CTC loss for a batch.

    Every sample is given the full (padded) time dimension of ``y_pred`` as
    its input length and the full (padded) width of ``y_true`` as its label
    length.

    Args:
        y_true: Batch of label id sequences.
        y_pred: Batch of per-timestep softmax outputs of the model.

    Returns:
        The result of ``keras.backend.ctc_batch_cost`` for the batch.
    """
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss


def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Build an (uncompiled) model similar to DeepSpeech2.

    Args:
        input_dim: Number of frequency bins per spectrogram frame.
        output_dim: Vocabulary size; the classification layer has
            ``output_dim + 1`` units.
        rnn_layers: Number of stacked bidirectional GRU layers.
        rnn_units: Units per GRU direction.

    Returns:
        A ``keras.Model``.  The caller is responsible for compiling it.
    """
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulted volume to feed the RNN layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        # Dropout between recurrent layers, but not after the last one
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    return model


def decode_batch_predictions(pred):
    """Greedy-decode a batch of network outputs into text.

    Reads the module-level ``num_to_char`` lookup layer.

    Args:
        pred: Array of model outputs for one batch.

    Returns:
        A list with one decoded string per batch element.
    """
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text


def transcribe_dataset(model, dataset):
    """Run ``model`` over ``dataset`` and collect predicted and target texts.

    Args:
        model: Model whose ``predict`` output is fed to
            ``decode_batch_predictions``.
        dataset: Iterable of ``(X, y)`` batches.

    Returns:
        A ``(predictions, targets)`` tuple of equally long lists of strings.
    """
    predictions = []
    targets = []
    for X, y in dataset:
        batch_predictions = model.predict(X)
        predictions.extend(decode_batch_predictions(batch_predictions))
        for label in y:
            label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
            targets.append(label)
    return predictions, targets


def print_wer_report(predictions, targets, num_samples):
    """Print the word error rate and ``num_samples`` random transcriptions."""
    wer_score = wer(targets, predictions)
    print("-" * 100)
    print(f"Word Error Rate: {wer_score:.4f}")
    print("-" * 100)
    for i in np.random.randint(0, len(predictions), num_samples):
        print(f"Target    : {targets[i]}")
        print(f"Prediction: {predictions[i]}")
        print("-" * 100)


def build_dataset(df, batch_size):
    """Create the batched, prefetched ``tf.data`` pipeline for one split.

    Args:
        df: DataFrame with ``file_name`` and ``normalized_transcription``
            columns.
        batch_size: Number of samples per padded batch.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (list(df["file_name"]), list(df["normalized_transcription"]))
    )
    return (
        dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
        .padded_batch(batch_size)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )


class CallbackEval(keras.callbacks.Callback):
    """Displays a batch of outputs after every epoch."""

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def on_epoch_end(self, epoch: int, logs=None):
        # Use the model Keras attached to this callback instead of relying on
        # a module-level global named `model`.
        predictions, targets = transcribe_dataset(self.model, self.dataset)
        print_wer_report(predictions, targets, 2)


# -----------------
# main start
# -----------------
if __name__ == "__main__":
    # Set to True to resume from the latest checkpoint in `checkpoint_dir`,
    # reusing the train/validation split saved by the first run.
    CONT_F = False
    # Set to True to download LJSpeech via Keras instead of using the local copy.
    DOWNLOAD_DATASET = False
    # Set to True to plot one spectrogram/waveform before training.
    VISUALIZE_SAMPLE = False

    initial_epoch = 0  # starts at 0; overwritten from the checkpoint name on resume
    checkpoint_dir = "training-save0"
    train_split_path = "data/df_train.pkl"
    val_split_path = "data/df_val.pkl"

    # --- Load the LJSpeech dataset -----------------------------------------
    if DOWNLOAD_DATASET:
        data_url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
        data_path = keras.utils.get_file("LJSpeech-1.1", data_url, untar=True)
    else:
        # Use the copy that was already downloaded for the
        # lstm_sound_to_text project.
        data_path = "../lstm_sound_to_text/Datasets/LJSpeech-1.1"
    wavs_path = data_path + "/wavs/"
    metadata_path = data_path + "/metadata.csv"

    # --- Train/validation split --------------------------------------------
    # The split is shuffled randomly, so it is saved to disk on a fresh run
    # and reloaded on resume to keep both runs on exactly the same split.
    # https://note.nkmk.me/python-pandas-to-pickle-read-pickle/
    if not CONT_F:
        # Read metadata file and parse it
        metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
        metadata_df.columns = ["file_name", "transcription", "normalized_transcription"]
        metadata_df = metadata_df[["file_name", "normalized_transcription"]]
        metadata_df = metadata_df.sample(frac=1).reset_index(drop=True)

        split = int(len(metadata_df) * 0.90)
        df_train = metadata_df[:split]
        df_val = metadata_df[split:]
        print('type(df_train)', type(df_train))
        # to_pickle does not create missing directories.
        os.makedirs(os.path.dirname(train_split_path), exist_ok=True)
        df_train.to_pickle(train_split_path)
        df_val.to_pickle(val_split_path)
    else:
        df_train = pd.read_pickle(train_split_path)
        df_val = pd.read_pickle(val_split_path)

    print(df_train.head(3))
    print(f"Size of the training set: {len(df_train)}")
    print(f"Size of the validation set: {len(df_val)}")

    # --- Preprocessing ------------------------------------------------------
    # The set of characters accepted in the transcription.
    characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
    # Mapping characters to integers
    char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
    # Mapping integers back to original characters
    num_to_char = keras.layers.StringLookup(
        vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
    )

    print(
        f"The vocabulary is: {char_to_num.get_vocabulary()} "
        f"(size ={char_to_num.vocabulary_size()})"
    )

    # --- Dataset objects ----------------------------------------------------
    batch_size = 32
    train_dataset = build_dataset(df_train, batch_size)
    validation_dataset = build_dataset(df_val, batch_size)

    # --- Callbacks ----------------------------------------------------------
    # Include the epoch in the file name (uses `str.format`); the resume logic
    # below parses the epoch number back out of this name.
    checkpoint_path = checkpoint_dir + "/cp-{epoch:04d}.ckpt"

    # Save weights only, and only when the training loss improves.
    cp_callback = ModelCheckpoint(
        filepath=checkpoint_path,
        monitor="loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode="min",
    )

    # TensorBoard logging
    # https://teratail.com/questions/97nyrumr5iix6d
    tb_callback = TensorBoard(checkpoint_dir + "/logs", update_freq=1)

    # --- Resume bookkeeping -------------------------------------------------
    latest = None
    if CONT_F:
        latest = tf.train.latest_checkpoint(checkpoint_dir)
        if latest is None:
            raise FileNotFoundError(
                f"CONT_F is True but no checkpoint was found in {checkpoint_dir!r}"
            )
        # e.g. "training-save0/cp-0002.ckpt" -> "cp-0002" -> 2
        basename_without_ext = os.path.splitext(os.path.basename(latest))[0]
        initial_epoch = int(basename_without_ext.split('-')[1])
        print('initial_epoch:', initial_epoch)

    # --- Optional visualisation --------------------------------------------
    if VISUALIZE_SAMPLE:
        plt.figure(figsize=(8, 5))
        for batch in train_dataset.take(1):
            spectrogram = batch[0][0].numpy()
            spectrogram = np.array([np.trim_zeros(x) for x in np.transpose(spectrogram)])
            label = batch[1][0]
            # Spectrogram
            label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
            ax = plt.subplot(2, 1, 1)
            ax.imshow(spectrogram, vmax=1)
            ax.set_title(label)
            ax.axis("off")
            # Wav
            file = tf.io.read_file(wavs_path + list(df_train["file_name"])[0] + ".wav")
            audio, sample_rate = tf.audio.decode_wav(file)
            audio = audio.numpy()
            ax = plt.subplot(2, 1, 2)
            plt.plot(audio)
            ax.set_title("Signal Wave")
            ax.set_xlim(0, len(audio))
            # Play back at the rate stored in the file rather than a
            # hard-coded 16 kHz.
            display.display(
                display.Audio(np.transpose(audio), rate=int(sample_rate.numpy()))
            )
        plt.show()

    # --- Model --------------------------------------------------------------
    model = build_model(
        input_dim=fft_length // 2 + 1,
        output_dim=char_to_num.vocabulary_size(),
        rnn_units=512,
    )
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(optimizer=opt, loss=CTCLoss)

    if CONT_F:
        print(latest)  # e.g. training_2\cp-0002.ckpt
        model.load_weights(latest)

    # --- Training and evaluating -------------------------------------------
    # Callback function to check transcription on the val set.
    validation_callback = CallbackEval(validation_dataset)

    # Number of additional epochs to train in this run.
    epoch_num = 100
    history = model.fit(
        train_dataset,
        validation_data=validation_dataset,
        epochs=initial_epoch + epoch_num,
        initial_epoch=initial_epoch,
        callbacks=[validation_callback, cp_callback, tb_callback],
    )

    print(history.history.keys())
    # dict_keys(['loss', 'val_loss'])

    # --- Inference ----------------------------------------------------------
    # Let's check results on more validation samples
    predictions, targets = transcribe_dataset(model, validation_dataset)
    print_wer_report(predictions, targets, 5)

ここまで来たか、
Web会議にNottaを起動しておくことで、リアルタイムで会議内容を文字に起します。

細かくノートを取る必要がないので、会議内容や議論に思う存分集中することができるそうな。
議事録作成の時間短縮が見込まれるAI自動文字起こしサービスを一度試してみては、どうぞね！
議事録作成の手間を大幅に軽減【Notta】
(広告)

カテゴリ:

検索

このブログ記事について

カテゴリ

月別アーカイブ

ウェブページ

サイトナビ