1/* 2 * Voxware MetaSound decoder 3 * Copyright (c) 2013 Konstantin Shishkov 4 * based on TwinVQ decoder 5 * Copyright (c) 2009 Vitor Sessak 6 * 7 * This file is part of FFmpeg. 8 * 9 * FFmpeg is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU Lesser General Public 11 * License as published by the Free Software Foundation; either 12 * version 2.1 of the License, or (at your option) any later version. 13 * 14 * FFmpeg is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Lesser General Public License for more details. 18 * 19 * You should have received a copy of the GNU Lesser General Public 20 * License along with FFmpeg; if not, write to the Free Software 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22 */ 23 24#include <inttypes.h> 25#include <math.h> 26#include <stdint.h> 27 28#define BITSTREAM_READER_LE 29#include "libavutil/channel_layout.h" 30#include "libavutil/float_dsp.h" 31#include "avcodec.h" 32#include "get_bits.h" 33#include "fft.h" 34#include "internal.h" 35#include "lsp.h" 36#include "sinewin.h" 37 38#include "twinvq.h" 39#include "metasound_data.h" 40 41static void add_peak(float period, int width, const float *shape, 42 float ppc_gain, float *speech, int len) 43{ 44 int i, j, center; 45 const float *shape_end = shape + len; 46 47 // First peak centered around zero 48 for (i = 0; i < width / 2; i++) 49 speech[i] += ppc_gain * *shape++; 50 51 for (i = 1; i < ROUNDED_DIV(len, width); i++) { 52 center = (int)(i * period + 0.5); 53 for (j = -width / 2; j < (width + 1) / 2; j++) 54 speech[j + center] += ppc_gain * *shape++; 55 } 56 57 // For the last block, be careful not to go beyond the end of the buffer 58 center = (int)(i * period + 0.5); 59 for (j = -width / 2; j < (width + 1) / 2 && shape < shape_end; j++) 60 speech[j + center] += ppc_gain * *shape++; 61} 62 63static void decode_ppc(TwinVQContext *tctx, int period_coef, int g_coef, 64 const float *shape, float *speech) 65{ 66 const TwinVQModeTab *mtab = tctx->mtab; 67 int isampf = tctx->avctx->sample_rate / 1000; 68 int ibps = tctx->avctx->bit_rate / (1000 * tctx->avctx->channels); 69 int width; 70 71 float ratio = (float)mtab->size / isampf; 72 float min_period, max_period, period_range, period; 73 float some_mult; 74 75 float pgain_base, pgain_step, ppc_gain; 76 77 if (tctx->avctx->channels == 1) { 78 min_period = log2(ratio * 0.2); 79 max_period = min_period + log2(6); 80 } else { 81 min_period = (int)(ratio * 0.2 * 400 + 0.5) / 400.0; 82 max_period = (int)(ratio * 0.2 * 400 * 6 + 0.5) / 400.0; 83 } 84 period_range = max_period - min_period; 85 period = min_period + period_coef * period_range / 86 ((1 << mtab->ppc_period_bit) - 1); 87 if (tctx->avctx->channels == 1) 88 period = powf(2.0, period); 89 else 90 period = (int)(period * 400 + 0.5) / 400.0; 91 92 switch (isampf) { 93 case 8: some_mult = 2.0; break; 94 case 11: some_mult = 3.0; break; 95 case 16: some_mult = 3.0; break; 96 case 22: some_mult = ibps == 32 ? 2.0 : 4.0; break; 97 case 44: some_mult = 8.0; break; 98 default: some_mult = 4.0; 99 } 100 101 width = (int)(some_mult / (mtab->size / period) * mtab->ppc_shape_len); 102 if (isampf == 22 && ibps == 32) 103 width = (int)((2.0 / period + 1) * width + 0.5); 104 105 pgain_base = tctx->avctx->channels == 2 ? 25000.0 : 20000.0; 106 pgain_step = pgain_base / ((1 << mtab->pgain_bit) - 1); 107 ppc_gain = 1.0 / 8192 * 108 twinvq_mulawinv(pgain_step * g_coef + pgain_step / 2, 109 pgain_base, TWINVQ_PGAIN_MU); 110 111 add_peak(period, width, shape, ppc_gain, speech, mtab->ppc_shape_len); 112} 113 114static void dec_bark_env(TwinVQContext *tctx, const uint8_t *in, int use_hist, 115 int ch, float *out, float gain, 116 enum TwinVQFrameType ftype) 117{ 118 const TwinVQModeTab *mtab = tctx->mtab; 119 int i, j; 120 float *hist = tctx->bark_hist[ftype][ch]; 121 float val = ((const float []) { 0.4, 0.35, 0.28 })[ftype]; 122 int bark_n_coef = mtab->fmode[ftype].bark_n_coef; 123 int fw_cb_len = mtab->fmode[ftype].bark_env_size / bark_n_coef; 124 int idx = 0; 125 126 if (tctx->avctx->channels == 1) 127 val = 0.5; 128 for (i = 0; i < fw_cb_len; i++) 129 for (j = 0; j < bark_n_coef; j++, idx++) { 130 float tmp2 = mtab->fmode[ftype].bark_cb[fw_cb_len * in[j] + i] * 131 (1.0 / 2048); 132 float st; 133 134 if (tctx->avctx->channels == 1) 135 st = use_hist ? 136 tmp2 + val * hist[idx] + 1.0 : tmp2 + 1.0; 137 else 138 st = use_hist ? (1.0 - val) * tmp2 + val * hist[idx] + 1.0 139 : tmp2 + 1.0; 140 141 hist[idx] = tmp2; 142 if (st < 0.1) 143 st = 0.1; 144 145 twinvq_memset_float(out, st * gain, 146 mtab->fmode[ftype].bark_tab[idx]); 147 out += mtab->fmode[ftype].bark_tab[idx]; 148 } 149} 150 151static void read_cb_data(TwinVQContext *tctx, GetBitContext *gb, 152 uint8_t *dst, enum TwinVQFrameType ftype) 153{ 154 int i; 155 156 for (i = 0; i < tctx->n_div[ftype]; i++) { 157 int bs_second_part = (i >= tctx->bits_main_spec_change[ftype]); 158 159 *dst++ = get_bits(gb, tctx->bits_main_spec[0][ftype][bs_second_part]); 160 *dst++ = get_bits(gb, tctx->bits_main_spec[1][ftype][bs_second_part]); 161 } 162} 163 164static int metasound_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx, 165 const uint8_t *buf, int buf_size) 166{ 167 TwinVQFrameData *bits; 168 const TwinVQModeTab *mtab = tctx->mtab; 169 int channels = tctx->avctx->channels; 170 int sub; 171 GetBitContext gb; 172 int i, j, k; 173 174 init_get_bits(&gb, buf, buf_size * 8); 175 176 for (tctx->cur_frame = 0; tctx->cur_frame < tctx->frames_per_packet; 177 tctx->cur_frame++) { 178 bits = tctx->bits + tctx->cur_frame; 179 180 bits->window_type = get_bits(&gb, TWINVQ_WINDOW_TYPE_BITS); 181 182 if (bits->window_type > 8) { 183 av_log(avctx, AV_LOG_ERROR, "Invalid window type, broken sample?\n"); 184 return AVERROR_INVALIDDATA; 185 } 186 187 bits->ftype = ff_twinvq_wtype_to_ftype_table[tctx->bits[tctx->cur_frame].window_type]; 188 189 sub = mtab->fmode[bits->ftype].sub; 190 191 if (bits->ftype != TWINVQ_FT_SHORT && !tctx->is_6kbps) 192 get_bits(&gb, 2); 193 194 read_cb_data(tctx, &gb, bits->main_coeffs, bits->ftype); 195 196 for (i = 0; i < channels; i++) 197 for (j = 0; j < sub; j++) 198 for (k = 0; k < mtab->fmode[bits->ftype].bark_n_coef; k++) 199 bits->bark1[i][j][k] = 200 get_bits(&gb, mtab->fmode[bits->ftype].bark_n_bit); 201 202 for (i = 0; i < channels; i++) 203 for (j = 0; j < sub; j++) 204 bits->bark_use_hist[i][j] = get_bits1(&gb); 205 206 if (bits->ftype == TWINVQ_FT_LONG) { 207 for (i = 0; i < channels; i++) 208 bits->gain_bits[i] = get_bits(&gb, TWINVQ_GAIN_BITS); 209 } else { 210 for (i = 0; i < channels; i++) { 211 bits->gain_bits[i] = get_bits(&gb, TWINVQ_GAIN_BITS); 212 for (j = 0; j < sub; j++) 213 bits->sub_gain_bits[i * sub + j] = 214 get_bits(&gb, TWINVQ_SUB_GAIN_BITS); 215 } 216 } 217 218 for (i = 0; i < channels; i++) { 219 bits->lpc_hist_idx[i] = get_bits(&gb, mtab->lsp_bit0); 220 bits->lpc_idx1[i] = get_bits(&gb, mtab->lsp_bit1); 221 222 for (j = 0; j < mtab->lsp_split; j++) 223 bits->lpc_idx2[i][j] = get_bits(&gb, mtab->lsp_bit2); 224 } 225 226 if (bits->ftype == TWINVQ_FT_LONG) { 227 read_cb_data(tctx, &gb, bits->ppc_coeffs, 3); 228 for (i = 0; i < channels; i++) { 229 bits->p_coef[i] = get_bits(&gb, mtab->ppc_period_bit); 230 bits->g_coef[i] = get_bits(&gb, mtab->pgain_bit); 231 } 232 } 233 234 // subframes are aligned to nibbles 235 if (get_bits_count(&gb) & 3) 236 skip_bits(&gb, 4 - (get_bits_count(&gb) & 3)); 237 } 238 239 return (get_bits_count(&gb) + 7) / 8; 240} 241 242typedef struct MetasoundProps { 243 uint32_t tag; 244 int bit_rate; 245 int channels; 246 int sample_rate; 247} MetasoundProps; 248 249static const MetasoundProps codec_props[] = { 250 { MKTAG('V','X','0','3'), 6, 1, 8000 }, 251 { MKTAG('V','X','0','4'), 12, 2, 8000 }, 252 253 { MKTAG('V','O','X','i'), 8, 1, 8000 }, 254 { MKTAG('V','O','X','j'), 10, 1, 11025 }, 255 { MKTAG('V','O','X','k'), 16, 1, 16000 }, 256 { MKTAG('V','O','X','L'), 24, 1, 22050 }, 257 { MKTAG('V','O','X','q'), 32, 1, 44100 }, 258 { MKTAG('V','O','X','r'), 40, 1, 44100 }, 259 { MKTAG('V','O','X','s'), 48, 1, 44100 }, 260 { MKTAG('V','O','X','t'), 16, 2, 8000 }, 261 { MKTAG('V','O','X','u'), 20, 2, 11025 }, 262 { MKTAG('V','O','X','v'), 32, 2, 16000 }, 263 { MKTAG('V','O','X','w'), 48, 2, 22050 }, 264 { MKTAG('V','O','X','x'), 64, 2, 44100 }, 265 { MKTAG('V','O','X','y'), 80, 2, 44100 }, 266 { MKTAG('V','O','X','z'), 96, 2, 44100 }, 267 268 { 0, 0, 0, 0 } 269}; 270 271static av_cold int metasound_decode_init(AVCodecContext *avctx) 272{ 273 int isampf, ibps; 274 TwinVQContext *tctx = avctx->priv_data; 275 uint32_t tag; 276 const MetasoundProps *props = codec_props; 277 278 if (!avctx->extradata || avctx->extradata_size < 16) { 279 av_log(avctx, AV_LOG_ERROR, "Missing or incomplete extradata\n"); 280 return AVERROR_INVALIDDATA; 281 } 282 283 tag = AV_RL32(avctx->extradata + 12); 284 285 for (;;) { 286 if (!props->tag) { 287 av_log(avctx, AV_LOG_ERROR, "Could not find tag %08"PRIX32"\n", tag); 288 return AVERROR_INVALIDDATA; 289 } 290 if (props->tag == tag) { 291 avctx->sample_rate = props->sample_rate; 292 avctx->channels = props->channels; 293 avctx->bit_rate = props->bit_rate * 1000; 294 isampf = avctx->sample_rate / 1000; 295 break; 296 } 297 props++; 298 } 299 300 if (avctx->channels <= 0 || avctx->channels > TWINVQ_CHANNELS_MAX) { 301 av_log(avctx, AV_LOG_ERROR, "Unsupported number of channels: %i\n", 302 avctx->channels); 303 return AVERROR_INVALIDDATA; 304 } 305 avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO 306 : AV_CH_LAYOUT_STEREO; 307 308 ibps = avctx->bit_rate / (1000 * avctx->channels); 309 310 switch ((avctx->channels << 16) + (isampf << 8) + ibps) { 311 case (1 << 16) + ( 8 << 8) + 6: 312 tctx->mtab = &ff_metasound_mode0806; 313 break; 314 case (2 << 16) + ( 8 << 8) + 6: 315 tctx->mtab = &ff_metasound_mode0806s; 316 break; 317 case (1 << 16) + ( 8 << 8) + 8: 318 tctx->mtab = &ff_metasound_mode0808; 319 break; 320 case (2 << 16) + ( 8 << 8) + 8: 321 tctx->mtab = &ff_metasound_mode0808s; 322 break; 323 case (1 << 16) + (11 << 8) + 10: 324 tctx->mtab = &ff_metasound_mode1110; 325 break; 326 case (2 << 16) + (11 << 8) + 10: 327 tctx->mtab = &ff_metasound_mode1110s; 328 break; 329 case (1 << 16) + (16 << 8) + 16: 330 tctx->mtab = &ff_metasound_mode1616; 331 break; 332 case (2 << 16) + (16 << 8) + 16: 333 tctx->mtab = &ff_metasound_mode1616s; 334 break; 335 case (1 << 16) + (22 << 8) + 24: 336 tctx->mtab = &ff_metasound_mode2224; 337 break; 338 case (2 << 16) + (22 << 8) + 24: 339 tctx->mtab = &ff_metasound_mode2224s; 340 break; 341 case (1 << 16) + (44 << 8) + 32: 342 tctx->mtab = &ff_metasound_mode4432; 343 break; 344 case (2 << 16) + (44 << 8) + 32: 345 tctx->mtab = &ff_metasound_mode4432s; 346 break; 347 case (1 << 16) + (44 << 8) + 40: 348 tctx->mtab = &ff_metasound_mode4440; 349 break; 350 case (2 << 16) + (44 << 8) + 40: 351 tctx->mtab = &ff_metasound_mode4440s; 352 break; 353 case (1 << 16) + (44 << 8) + 48: 354 tctx->mtab = &ff_metasound_mode4448; 355 break; 356 case (2 << 16) + (44 << 8) + 48: 357 tctx->mtab = &ff_metasound_mode4448s; 358 break; 359 default: 360 av_log(avctx, AV_LOG_ERROR, 361 "This version does not support %d kHz - %d kbit/s/ch mode.\n", 362 isampf, ibps); 363 return AVERROR(ENOSYS); 364 } 365 366 tctx->codec = TWINVQ_CODEC_METASOUND; 367 tctx->read_bitstream = metasound_read_bitstream; 368 tctx->dec_bark_env = dec_bark_env; 369 tctx->decode_ppc = decode_ppc; 370 tctx->frame_size = avctx->bit_rate * tctx->mtab->size 371 / avctx->sample_rate; 372 tctx->is_6kbps = ibps == 6; 373 374 return ff_twinvq_decode_init(avctx); 375} 376 377AVCodec ff_metasound_decoder = { 378 .name = "metasound", 379 .long_name = NULL_IF_CONFIG_SMALL("Voxware MetaSound"), 380 .type = AVMEDIA_TYPE_AUDIO, 381 .id = AV_CODEC_ID_METASOUND, 382 .priv_data_size = sizeof(TwinVQContext), 383 .init = metasound_decode_init, 384 .close = ff_twinvq_decode_close, 385 .decode = ff_twinvq_decode_frame, 386 .capabilities = CODEC_CAP_DR1, 387 .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP, 388 AV_SAMPLE_FMT_NONE }, 389}; 390