soundnet.py
from keras.layers import BatchNormalization, Activation, Conv1D, MaxPooling1D, ZeroPadding1D, InputLayer
from keras.models import Sequential
import numpy as np
import librosa


def preprocess(audio):
    audio *= 256.0  # SoundNet expects the waveform range to be between -256 and 256
    # reshape the audio data so it fits the graph: (batch_size, num_samples, num_filter_channels)
    audio = np.reshape(audio, (1, -1, 1))
    return audio


def load_audio(audio_file):
    sample_rate = 22050  # SoundNet works on mono audio with a sample rate of 22050 Hz
    audio, _ = librosa.load(audio_file, dtype='float32', sr=sample_rate, mono=True)
    audio = preprocess(audio)
    return audio
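
# Example usage of load_audio (a sketch; 'example.wav' is a hypothetical path):
#
#   x = load_audio('example.wav')
#   print(x.shape)  # (1, num_samples, 1), values scaled to roughly [-256, 256]
#
# librosa resamples to 22050 Hz and downmixes to mono on load, so the input file
# does not already need to be mono or at that sample rate.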


def build_model():
    """
    Builds the SoundNet model and loads the weights from the model file
    (the 8-layer model is kept at models/sound8.npy).
    :return: a Keras Sequential model with the pretrained SoundNet weights loaded
    """
    # allow_pickle (and latin1 encoding, for weights pickled under Python 2) is
    # needed to read the weight dictionary with Python 3 / recent NumPy versions
    model_weights = np.load('models/sound8.npy', allow_pickle=True, encoding='latin1').item()
    model = Sequential()
    model.add(InputLayer(batch_input_shape=(1, None, 1)))

    filter_parameters = [{'name': 'conv1', 'num_filters': 16, 'padding': 32,
                          'kernel_size': 64, 'conv_strides': 2,
                          'pool_size': 8, 'pool_strides': 8},
                         {'name': 'conv2', 'num_filters': 32, 'padding': 16,
                          'kernel_size': 32, 'conv_strides': 2,
                          'pool_size': 8, 'pool_strides': 8},
                         {'name': 'conv3', 'num_filters': 64, 'padding': 8,
                          'kernel_size': 16, 'conv_strides': 2},
                         {'name': 'conv4', 'num_filters': 128, 'padding': 4,
                          'kernel_size': 8, 'conv_strides': 2},
                         {'name': 'conv5', 'num_filters': 256, 'padding': 2,
                          'kernel_size': 4, 'conv_strides': 2,
                          'pool_size': 4, 'pool_strides': 4},
                         {'name': 'conv6', 'num_filters': 512, 'padding': 2,
                          'kernel_size': 4, 'conv_strides': 2},
                         {'name': 'conv7', 'num_filters': 1024, 'padding': 2,
                          'kernel_size': 4, 'conv_strides': 2},
                         {'name': 'conv8_2', 'num_filters': 401, 'padding': 0,
                          'kernel_size': 8, 'conv_strides': 2},
                         ]

    for x in filter_parameters:
        model.add(ZeroPadding1D(padding=x['padding']))
        model.add(Conv1D(x['num_filters'],
                         kernel_size=x['kernel_size'],
                         strides=x['conv_strides'],
                         padding='valid'))

        # copy the pretrained convolution weights into the layer just added
        weights = model_weights[x['name']]['weights'].reshape(model.layers[-1].get_weights()[0].shape)
        biases = model_weights[x['name']]['biases']
        model.layers[-1].set_weights([weights, biases])

        # every layer except the final conv8_2 head is followed by batch norm and ReLU
        if 'conv8' not in x['name']:
            gamma = model_weights[x['name']]['gamma']
            beta = model_weights[x['name']]['beta']
            mean = model_weights[x['name']]['mean']
            var = model_weights[x['name']]['var']
            model.add(BatchNormalization())
            model.layers[-1].set_weights([gamma, beta, mean, var])
            model.add(Activation('relu'))

        if 'pool_size' in x:
            model.add(MaxPooling1D(pool_size=x['pool_size'],
                                   strides=x['pool_strides'],
                                   padding='valid'))

    return model
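
# Quick sanity check (a sketch, assuming models/sound8.npy is in place): the final
# Conv1D layer ('conv8_2') has 401 filters, one per scene category in
# categories/categories_places2.txt, so the model emits one 401-way score vector
# per output time step.
#
#   m = build_model()
#   m.summary()  # the last dimension of the final output shape is 401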


def predict_scene_from_audio_file(audio_file):
    model = build_model()
    audio = load_audio(audio_file)
    return model.predict(audio)


def predictions_to_scenes(prediction):
    scenes = []
    with open('categories/categories_places2.txt', 'r') as f:
        categories = f.read().split('\n')
    for p in range(prediction.shape[1]):
        scenes.append(categories[np.argmax(prediction[0, p, :])])
    return scenes


if __name__ == '__main__':
    # SoundNet demonstration
    prediction = predict_scene_from_audio_file('railroad_audio.wav')
    print(predictions_to_scenes(prediction))
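
    # A minimal sketch (not part of the original script) of one way to collapse the
    # per-time-step predictions into a single scene label: average the 401-way scene
    # scores over time and take the argmax. Mean pooling is an assumption; a majority
    # vote over predictions_to_scenes(prediction) would work just as well.
    mean_scores = prediction.mean(axis=1)  # shape: (1, 401)
    with open('categories/categories_places2.txt', 'r') as f:
        categories = f.read().split('\n')
    print('Most likely scene overall: %s' % categories[int(np.argmax(mean_scores[0]))])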