Andrew Ng's Coursera Deep Learning course, deeplearning.ai (5-3) Sequence Models and Attention Mechanism -- Programming Assignment (2): Trigger Word Detection

1. Data Processing

For the data-processing details, see this blog:

https://blog.csdn.net/haoyutiangang/article/details/81231887

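The blog above walks through the data synthesis; in short, each 10-second training example is built by overlaying recordings of "activate" and of negative words onto background noise, computing its spectrogram as X, and setting the 50 output labels immediately after each "activate" ends to 1. A minimal sketch of that labelling step, assuming Ty = 1375 output steps for a 10,000 ms clip as in the assignment:

import numpy as np

Ty = 1375  # number of output time steps for a 10,000 ms training clip

def insert_ones(y, segment_end_ms):
    # Convert the end time of the "activate" segment from milliseconds to an output-step index
    segment_end_y = int(segment_end_ms * Ty / 10000.0)
    # Label the following 50 output steps as 1 ("activate was just said")
    for i in range(segment_end_y + 1, segment_end_y + 51):
        if i < Ty:
            y[0, i] = 1
    return y

# Example: an "activate" clip ending at 4,251 ms sets y[0, 585:635] to 1
y = insert_ones(np.zeros((1, Ty)), 4251)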

2. Model

The model maps the input spectrogram to a per-time-step probability that the trigger word has just been said: a 1-D convolution extracts local audio features and shrinks the number of time steps, two GRU layers model the sequence, and a time-distributed dense layer with a sigmoid outputs y<t> for every output step (see the code in section 3).
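A quick shape check (assuming the assignment's values of Tx = 5511 input spectrogram steps and Ty = 1375 label steps): with kernel_size = 15, strides = 4 and no padding, the Conv1D layer outputs

(Tx - kernel_size) // strides + 1 = (5511 - 15) // 4 + 1 = 1375

time steps, so every sigmoid output lines up with one label y<t>.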

3. Code Implementation

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
from pydub import AudioSegment
import random
import sys
import io
import os
import glob
import IPython
import matplotlib.pyplot as plt  # needed for the plots in detect_triggerword below
from td_utils import *
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.optimizers import Adam
# 1. Load the datasets
# Load the training set (spectrograms X and labels Y)
X = np.load("./XY_train/X.npy")
Y = np.load("./XY_train/Y.npy")
# Load the dev set
X_dev = np.load("./XY_dev/X_dev.npy")
Y_dev = np.load("./XY_dev/Y_dev.npy")

# 2. Build the model
def model(input_shape):
    """
    Build the trigger-word detection graph in Keras.
    input_shape -- shape of the model's input data, (Tx, n_freq)
    """
    X_input = Input(shape=input_shape)

    # Step 1: convolutional layer, extracts local features and reduces the number of time steps
    X = Conv1D(filters=196, kernel_size=15, strides=4)(X_input)
    X = BatchNormalization()(X)
    X = Activation('relu')(X)
    X = Dropout(0.8)(X)

    # Step 2: first GRU layer (return_sequences=True keeps one output per time step)
    X = GRU(units=128, return_sequences=True)(X)
    X = Dropout(0.8)(X)
    X = BatchNormalization()(X)
    X = Dropout(0.8)(X)

    # Step 3: second GRU layer
    X = GRU(units=128, return_sequences=True)(X)
    X = Dropout(0.8)(X)
    X = BatchNormalization()(X)
    X = Dropout(0.8)(X)

    # Step 4: time-distributed dense layer, one sigmoid unit per output time step
    X = TimeDistributed(Dense(1, activation='sigmoid'))(X)

    model = Model(inputs=X_input, outputs=X)
    return model
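# Optional sanity check (a sketch): instantiate the architecture above and inspect the layer
# shapes before the pretrained weights are loaded below. Tx = 5511 spectrogram time steps and
# n_freq = 101 frequency bins are the values used in the assignment.
Tx = 5511
n_freq = 101
model = model(input_shape=(Tx, n_freq))
model.summary()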
# Training from scratch is slow, so load the model that was pre-trained for the assignment
model = load_model('./models/tr_model.h5')
# Compile the model
opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy"])
# Fine-tune the model for one epoch on the training set
model.fit(X, Y, batch_size=5, epochs=1)
# Evaluate the model on the dev set
loss, acc = model.evaluate(X_dev, Y_dev)
print("Dev Set accuracy = ", acc)

# Make predictions on a new audio clip
def detect_triggerword(filename):
    plt.subplot(2, 1, 1)
    # Compute the spectrogram; graph_spectrogram returns shape (n_freq, Tx)
    x = graph_spectrogram(filename)
    # The model expects (Tx, n_freq), so swap the axes and add a batch dimension
    x = x.swapaxes(0, 1)
    x = np.expand_dims(x, axis=0)
    predictions = model.predict(x)

    plt.subplot(2, 1, 2)
    plt.plot(predictions[0, :, 0])
    plt.ylabel('probability')
    plt.show()

    return predictions
# When the word "activate" is detected, a chime sound is overlaid on the audio.
# However, y<t> contains many consecutive 1s around each detection, and we only want
# to chime once per detection rather than once per output step,
# so the chime_on_activate function below enforces a gap of 75 output steps between chimes.

chime_file = 'audio_examples/chime.wav'


def chime_on_activate(filename, predictions, threshold):
    audio_clip = AudioSegment.from_wav(filename)
    chime = AudioSegment.from_wav(chime_file)
    Ty = predictions.shape[1]

    # Number of output steps since the last chime
    consecutive_timesteps = 0

    for i in range(Ty):
        consecutive_timesteps += 1
        # Chime only if the prediction is above the threshold and more than 75 steps have passed
        if predictions[0, i, 0] > threshold and consecutive_timesteps > 75:
            # Overlay the chime at the corresponding position (in milliseconds) in the clip
            audio_clip = audio_clip.overlay(chime, position=((i / Ty) * audio_clip.duration_seconds) * 1000)
            consecutive_timesteps = 0

    audio_clip.export("chime_output.wav", format='wav')

# Try it on a dev-set example
filename = "./raw_data/dev/1.wav"
prediction = detect_triggerword(filename)
chime_on_activate(filename, prediction, 0.5)
IPython.display.Audio("./chime_output.wav")