|
| 1 | + |
| 2 | + |
| 3 | +""" |
| 4 | +
|
| 5 | +Inspired by the CRNN models described in |
| 6 | +
|
| 7 | +Sound Event Detection: A Tutorial |
| 8 | +https://arxiv.org/abs/2107.05463 |
| 9 | +
|
| 10 | +and |
| 11 | +
|
| 12 | +Convolutional Recurrent Neural Networks for Polyphonic Sound Event Detection |
| 13 | +https://arxiv.org/abs/1702.06286 |
| 14 | +""" |
| 15 | + |
| 16 | +# related code, https://chadrick-kwag.net/tf-keras-rnn-ctc-example/ |
| 17 | + |
def build_model(frames=128, bands=40, channels=1, n_classes=10,
                conv_size=(3,3),
                conv_block='conv',
                downsample_size=(2,2),
                n_stages=3, n_blocks_per_stage=1,
                filters=128, kernels_growth=1.0,
                fully_connected=64,
                rnn_units=32,
                temporal='bigru',
                dropout=0.5, l2=0.001, backend='detection'):
    """Build a Keras Sequential CRNN-style model for sound event detection.

    The model is three SeparableConv2D + MaxPooling2D blocks, followed by a
    temporal stage and an optional output backend.

    Args:
        frames, bands, channels: the input shape is (frames, bands, channels).
        n_classes: number of units in the final Dense layer.
        conv_size: kernel size of the three convolutional blocks.
        filters: number of filters in each convolutional block.
        fully_connected: units of the intermediate Dense layer used by the
            'classification' backend.
        rnn_units: units per GRU ('bigru') or filters per temporal
            convolution ('tcn').
        temporal: 'bigru' (two bidirectional GRU layers) or 'tcn' (two
            strided (9, 1) separable convolutions).
        backend: 'detection', 'classification', or a falsy value for no
            output layers.
        conv_block, downsample_size, n_stages, n_blocks_per_stage,
        kernels_growth, dropout, l2: accepted for interface compatibility
            but currently unused; the pooling sizes are hard-coded below.

    Returns:
        The assembled (uncompiled) Sequential model.

    Raises:
        ValueError: if `temporal` or `backend` is not a supported value.
    """
    # Validate the string options up front so a typo fails immediately,
    # before the slow TensorFlow import and before any layers are built.
    if temporal not in ('bigru', 'tcn'):
        raise ValueError(f"Unknown temporal parameter {temporal}")
    if backend and backend not in ('classification', 'detection'):
        raise ValueError(f"Unsupported backend '{backend}'")

    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import (
        GRU, Bidirectional, Dense, MaxPooling2D, Reshape,
        SeparableConv2D, Softmax, TimeDistributed,
    )

    model = Sequential()

    input_shape = (frames, bands, channels)

    def add_conv_block(model, downsample_size, conv_filters=filters,
                       kernel_size=conv_size, **kwargs):
        # Use the block's own kernel_size; previously the outer conv_size
        # was passed, so this argument was silently ignored.
        model.add(SeparableConv2D(conv_filters, kernel_size, **kwargs))
        model.add(MaxPooling2D(downsample_size))

        # TODO: add ReLu
        # TODO: BatchNorm etc?

    # Convolutional layers. Pooling is (1, k): only the band axis is pooled.
    # NOTE(review): the layers use their default padding, so the band axis
    # also shrinks at every convolution; the default bands=40 looks too small
    # for three stages with a (3,3) kernel -- confirm, or use padding='same'.
    add_conv_block(model, downsample_size=(1,5), input_shape=input_shape)
    add_conv_block(model, downsample_size=(1,2))
    add_conv_block(model, downsample_size=(1,2))

    # Temporal processing
    if temporal == 'bigru':
        # model.output_shape is available in both Keras 2 and Keras 3,
        # whereas per-layer `output_shape` no longer exists in Keras 3.
        n_steps = model.output_shape[1]
        # Merge the band and filter axes so the GRUs see (steps, features)
        model.add(Reshape((n_steps, -1)))
        model.add(Bidirectional(GRU(rnn_units, return_sequences=True)))
        model.add(Bidirectional(GRU(rnn_units, return_sequences=True)))
    else:
        # temporal == 'tcn' (validated above)
        # TODO: make downsampling adjustable
        model.add(SeparableConv2D(rnn_units, (9, 1), strides=(2,1)))
        model.add(SeparableConv2D(rnn_units, (9, 1), strides=(2,1)))

    # Output
    # TODO: support multiple layers
    # TODO: add Dropout
    if backend == 'classification':
        model.add(TimeDistributed(Dense(fully_connected, activation="linear")))
        # `layers` was never imported here; use the imported Dense directly
        model.add(Dense(n_classes))
        model.add(Softmax())
    elif backend == 'detection':
        # No input_shape here: Sequential infers it from the previous layer
        model.add(TimeDistributed(Dense(n_classes, activation="linear")))
        model.add(Softmax())
    # a falsy backend means no output layers are added

    return model
| 84 | + |
| 85 | + |
def test_model():
    """Smoke test: build a small 'tcn' model and print its layer summary."""

    model = build_model(filters=24, bands=64, rnn_units=16, n_classes=3, temporal='tcn')

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line after the table.
    model.summary()
| 91 | + |
| 92 | + |
# Script entry point: run the smoke test when executed directly.
if __name__ == "__main__":
    test_model()
| 95 | + |
0 commit comments