-
Notifications
You must be signed in to change notification settings - Fork 1
/
vadc.h
193 lines (147 loc) · 5.16 KB
/
vadc.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#pragma once
#include "utils.h"
#include "memory.h"
#include "string8.h"
// Build-time switch: defaults to 1 unless the build already defined it.
// A non-zero value makes this header pull in "onnx_helpers.h".
#ifndef ONNX_INFERENCE_ENABLED
#    define ONNX_INFERENCE_ENABLED 1
#endif
// Per-model parameters for a Silero model.
// NOTE(review): fields without an author note are described by name only;
// the code that fills them in is not part of this header — confirm there.
typedef struct Silero_Config
{
    // NOTE(irwin): sample rate input, implies fused model
    s32 sr_input_index;
    s32 batch_size_restriction;
    s32 batch_size;

    // NOTE(irwin): v5 only, 32 or 64
    s32 context_size;

    // NOTE(irwin): sequence count, does not include context_size
    s32 input_count;

    // Probability tensor shape: up to 4 dimensions are stored in prob_shape.
    // NOTE(review): presumably prob_shape_count is the number of valid
    // entries in prob_shape — confirm.
    size_t  prob_shape_count;
    int64_t prob_shape[4];
    size_t  prob_tensor_element_count;

    s32 output_dims;
    s32 silero_probability_out_index;
    s32 output_stride;

    s32 input_size_min;
    s32 input_size_max;

    s32 lstm_hidden_size;
    b32 is_silero_v5;
} Silero_Config;
// Pointers to the float buffers exchanged with the model, plus their sizes.
// This struct only holds the pointers; who allocates and frees the memory
// is not visible in this header.
typedef struct Tensor_Buffers
{
    int    window_size_samples;
    float *input_samples;
    float *output;

    // LSTM state: h/c and their "_out" counterparts.
    // NOTE(review): presumably the "_out" buffers receive the updated state
    // after an inference step — confirm against the inference code.
    int    lstm_count;
    float *lstm_h;
    float *lstm_c;
    float *lstm_h_out;
    float *lstm_c_out;
} Tensor_Buffers;
// NOTE(irwin): forward declare
typedef struct Silero_Context Silero_Context;

// Bundles an untyped backend pointer with the tensor buffers.
// NOTE(review): `backend` is a void pointer; the concrete type behind it is
// not visible in this header.
typedef struct VADC_Context
{
    void           *backend;
    Tensor_Buffers  buffers;
} VADC_Context;
#if ONNX_INFERENCE_ENABLED
#include "onnx_helpers.h"
#endif // ONNX_INFERENCE_ENABLED
#include <stdio.h>
// Model file names (wide-string literals). SILERO_FILENAME selects the default.
#define SILERO_FILENAME_V3_B_DYNAMIC L"silero_restored_v3.1_16k_v3_dyn.onnx"
#define SILERO_FILENAME_V4 L"silero_vad_v4.onnx"
#define SILERO_FILENAME_V5 L"silero_vad_v5_16k_minibatched-sim.onnx"
// #define SILERO_FILENAME SILERO_FILENAME_V4
#define SILERO_FILENAME SILERO_FILENAME_V5

// Slice geometry. Most of these are temporary: they are #undef'd below.
// Only SILERO_SLICE_COUNT_MAX and SILERO_V5_CONTEXT_SIZE stay defined for
// code that includes this header.
#define SILERO_SLICE_SAMPLES_8K 128
#define SILERO_SLICE_SAMPLES_16K 256
#define SILERO_SLICE_COUNT_MIN 2
#define SILERO_SLICE_COUNT_MAX 6
#define SILERO_V5_CONTEXT_SIZE 64
#define SILERO_SLICE_COUNT 2
// 512, 768, 1024, 1280, 1536
// #define SILERO_WINDOW_SIZE_SAMPLES (SILERO_SLICE_SAMPLES_16K * SILERO_SLICE_COUNT)
#define SILERO_SAMPLE_RATE 16000
// const size_t HARDCODED_WINDOW_SIZE_SAMPLES = SILERO_WINDOW_SIZE_SAMPLES;

// `static` gives the constant internal linkage. Without it, every translation
// unit that includes this header would emit its own external definition of
// HARDCODED_SAMPLE_RATE and the link would fail with duplicate symbols.
static const size_t HARDCODED_SAMPLE_RATE = SILERO_SAMPLE_RATE;

// Retire the temporary macros so they do not leak to includers.
// (#undef of a name that is not defined is a no-op.)
#undef SILERO_WINDOW_SIZE_SAMPLES
#undef SILERO_SAMPLE_RATE
#undef SILERO_SLICE_COUNT_MIN
#undef SILERO_SLICE_COUNT
#undef SILERO_SLICE_SAMPLES_8K
#undef SILERO_SLICE_SAMPLES_16K
// Mutable state that feed_probability() receives by pointer on each call.
// NOTE(review): field meanings are inferred from their names — confirm
// against the feed_probability() definition.
typedef struct FeedState FeedState;
struct FeedState
{
    int temp_end;
    int current_speech_start;
    b32 triggered;
};
// Value returned by feed_probability() and passed on to the segment
// emit/combine routines declared in this header.
// NOTE(review): presumably speech_start/speech_end are chunk indices and are
// meaningful only when is_valid is set — confirm against the definitions.
typedef struct FeedProbabilityResult FeedProbabilityResult;
struct FeedProbabilityResult
{
    int speech_start;
    int speech_end;
    b32 is_valid;
};
// Running statistics, passed by pointer to the segment emit/combine routines
// and by value to print_speech_stats().
// NOTE(review): units of the totals are not stated in this header — confirm.
typedef struct VADC_Stats
{
    s64 first_call_timestamp;
    s64 timer_frequency;

    double total_speech;
    double total_duration;
    s64    total_samples;

    b32 output_enabled;
} VADC_Stats;
// Output format selector for emitted speech segments.
//
// Defined and typedef'd in a single declaration: ISO C does not allow
// `enum X` to be referenced before its enumerator list is complete, so a
// separate forward typedef only compiles as a compiler extension.
typedef enum Segment_Output_Format
{
    Segment_Output_Format_Seconds = 0,
    Segment_Output_Format_CentiSeconds, // NOTE(irwin): hundredths of seconds, 500 -> 5 seconds

    // Number of formats above; keep this last.
    Segment_Output_Format_COUNT
} Segment_Output_Format;
// Declaration of the inference entry point; the definition is not in this header.
// Returns an int.
// NOTE(review): the meaning of the return value is not visible here — confirm
// against the definition.
// NOTE(review): parameter semantics below are inferred from names only:
//   - the *_ms parameters are presumably durations in milliseconds
//   - threshold / neg_threshold are presumably probability thresholds
//   - desired_sequence_count is a float although its name suggests a count —
//     confirm this is intentional
int run_inference( String8 model_path_arg,
MemoryArena *arena,
float min_silence_duration_ms,
float min_speech_duration_ms,
float threshold,
float neg_threshold,
float speech_pad_ms,
float desired_sequence_count,
b32 raw_probabilities,
Segment_Output_Format output_format,
String8 filename,
b32 stats_output_enabled,
s32 preferred_batch_size,
int audio_source,
float start_seconds );
// Declaration only; the definition is not in this header.
// Takes the context and config by value, a read-only float sample buffer with
// its sample count, and a writable probabilities buffer.
// NOTE(review): presumably writes one probability per processed chunk into
// probabilities_buffer; the required size of that buffer is not stated here —
// confirm against the definition.
void process_chunks( MemoryArena *arena, VADC_Context context, Silero_Config config,
const size_t buffered_samples_count,
const float *samples_buffer_float32,
float *probabilities_buffer );
// Declaration only; the definition is not in this header.
// Takes a pointer to caller-owned FeedState plus one probability value and
// returns a FeedProbabilityResult by value.
// NOTE(review): presumably updates *state and reports a finished segment via
// the result's is_valid flag — confirm against the definition.
FeedProbabilityResult feed_probability( FeedState *state,
int min_silence_duration_chunks,
int min_speech_duration_chunks,
float probability,
float threshold,
float neg_threshold,
int global_chunk_index );
// Declaration only; the definition is not in this header.
// Takes one segment by value, the padding, the output format, a pointer to
// the stats struct and the seconds-per-chunk factor.
// NOTE(review): presumably prints the segment and updates *stats — confirm
// against the definition.
void emit_speech_segment( FeedProbabilityResult segment,
float speech_pad_ms,
Segment_Output_Format output_format,
VADC_Stats *stats,
float seconds_per_chunk );
// Declaration only; the definition is not in this header.
// Takes a buffered segment and a newly fed result, both by value, and returns
// a FeedProbabilityResult.
// NOTE(review): judging by the name, it either merges the two segments or
// emits the buffered one, and the return value is the segment to keep
// buffered — confirm against the definition.
FeedProbabilityResult combine_or_emit_speech_segment( FeedProbabilityResult buffered,
FeedProbabilityResult feed_result,
float speech_pad_ms,
Segment_Output_Format output_format,
VADC_Stats *stats,
float seconds_per_chunk );
// NOTE(irwin): onnx helper routines
// NOTE(review): this heading looks stale — the only declaration under it
// takes a VADC_Stats by value and involves no ONNX types.
// Declared `static inline` with no definition in this header, so each
// translation unit that calls it must also see a definition.
static inline void print_speech_stats(VADC_Stats stats);