1/* 2 * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com> 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21/** 22 * @file 23 * tempo scaling audio filter -- an implementation of WSOLA algorithm 24 * 25 * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h 26 * from Apprentice Video player by Pavel Koshevoy. 27 * https://sourceforge.net/projects/apprenticevideo/ 28 * 29 * An explanation of SOLA algorithm is available at 30 * http://www.surina.net/article/time-and-pitch-scaling.html 31 * 32 * WSOLA is very similar to SOLA, only one major difference exists between 33 * these algorithms. SOLA shifts audio fragments along the output stream, 34 * where as WSOLA shifts audio fragments along the input stream. 35 * 36 * The advantage of WSOLA algorithm is that the overlap region size is 37 * always the same, therefore the blending function is constant and 38 * can be precomputed. 39 */ 40 41#include <float.h> 42#include "libavcodec/avfft.h" 43#include "libavutil/avassert.h" 44#include "libavutil/avstring.h" 45#include "libavutil/channel_layout.h" 46#include "libavutil/eval.h" 47#include "libavutil/opt.h" 48#include "libavutil/samplefmt.h" 49#include "avfilter.h" 50#include "audio.h" 51#include "internal.h" 52 53/** 54 * A fragment of audio waveform 55 */ 56typedef struct { 57 // index of the first sample of this fragment in the overall waveform; 58 // 0: input sample position 59 // 1: output sample position 60 int64_t position[2]; 61 62 // original packed multi-channel samples: 63 uint8_t *data; 64 65 // number of samples in this fragment: 66 int nsamples; 67 68 // rDFT transform of the down-mixed mono fragment, used for 69 // fast waveform alignment via correlation in frequency domain: 70 FFTSample *xdat; 71} AudioFragment; 72 73/** 74 * Filter state machine states 75 */ 76typedef enum { 77 YAE_LOAD_FRAGMENT, 78 YAE_ADJUST_POSITION, 79 YAE_RELOAD_FRAGMENT, 80 YAE_OUTPUT_OVERLAP_ADD, 81 YAE_FLUSH_OUTPUT, 82} FilterState; 83 84/** 85 * Filter state machine 86 */ 87typedef struct { 88 const AVClass *class; 89 90 // ring-buffer of input samples, necessary because some times 91 // input fragment position may be adjusted backwards: 92 uint8_t *buffer; 93 94 // ring-buffer maximum capacity, expressed in sample rate time base: 95 int ring; 96 97 // ring-buffer house keeping: 98 int size; 99 int head; 100 int tail; 101 102 // 0: input sample position corresponding to the ring buffer tail 103 // 1: output sample position 104 int64_t position[2]; 105 106 // sample format: 107 enum AVSampleFormat format; 108 109 // number of channels: 110 int channels; 111 112 // row of bytes to skip from one sample to next, across multple channels; 113 // stride = (number-of-channels * bits-per-sample-per-channel) / 8 114 int stride; 115 116 // fragment window size, power-of-two integer: 117 int window; 118 119 // Hann window coefficients, for feathering 120 // (blending) the overlapping fragment region: 121 float *hann; 122 123 // tempo scaling factor: 124 double tempo; 125 126 // a snapshot of previous fragment input and output position values 127 // captured when the tempo scale factor was set most recently: 128 int64_t origin[2]; 129 130 // current/previous fragment ring-buffer: 131 AudioFragment frag[2]; 132 133 // current fragment index: 134 uint64_t nfrag; 135 136 // current state: 137 FilterState state; 138 139 // for fast correlation calculation in frequency domain: 140 RDFTContext *real_to_complex; 141 RDFTContext *complex_to_real; 142 FFTSample *correlation; 143 144 // for managing AVFilterPad.request_frame and AVFilterPad.filter_frame 145 AVFrame *dst_buffer; 146 uint8_t *dst; 147 uint8_t *dst_end; 148 uint64_t nsamples_in; 149 uint64_t nsamples_out; 150} ATempoContext; 151 152#define OFFSET(x) offsetof(ATempoContext, x) 153 154static const AVOption atempo_options[] = { 155 { "tempo", "set tempo scale factor", 156 OFFSET(tempo), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 0.5, 2.0, 157 AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM }, 158 { NULL } 159}; 160 161AVFILTER_DEFINE_CLASS(atempo); 162 163inline static AudioFragment *yae_curr_frag(ATempoContext *atempo) 164{ 165 return &atempo->frag[atempo->nfrag % 2]; 166} 167 168inline static AudioFragment *yae_prev_frag(ATempoContext *atempo) 169{ 170 return &atempo->frag[(atempo->nfrag + 1) % 2]; 171} 172 173/** 174 * Reset filter to initial state, do not deallocate existing local buffers. 175 */ 176static void yae_clear(ATempoContext *atempo) 177{ 178 atempo->size = 0; 179 atempo->head = 0; 180 atempo->tail = 0; 181 182 atempo->nfrag = 0; 183 atempo->state = YAE_LOAD_FRAGMENT; 184 185 atempo->position[0] = 0; 186 atempo->position[1] = 0; 187 188 atempo->origin[0] = 0; 189 atempo->origin[1] = 0; 190 191 atempo->frag[0].position[0] = 0; 192 atempo->frag[0].position[1] = 0; 193 atempo->frag[0].nsamples = 0; 194 195 atempo->frag[1].position[0] = 0; 196 atempo->frag[1].position[1] = 0; 197 atempo->frag[1].nsamples = 0; 198 199 // shift left position of 1st fragment by half a window 200 // so that no re-normalization would be required for 201 // the left half of the 1st fragment: 202 atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2); 203 atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2); 204 205 av_frame_free(&atempo->dst_buffer); 206 atempo->dst = NULL; 207 atempo->dst_end = NULL; 208 209 atempo->nsamples_in = 0; 210 atempo->nsamples_out = 0; 211} 212 213/** 214 * Reset filter to initial state and deallocate all buffers. 215 */ 216static void yae_release_buffers(ATempoContext *atempo) 217{ 218 yae_clear(atempo); 219 220 av_freep(&atempo->frag[0].data); 221 av_freep(&atempo->frag[1].data); 222 av_freep(&atempo->frag[0].xdat); 223 av_freep(&atempo->frag[1].xdat); 224 225 av_freep(&atempo->buffer); 226 av_freep(&atempo->hann); 227 av_freep(&atempo->correlation); 228 229 av_rdft_end(atempo->real_to_complex); 230 atempo->real_to_complex = NULL; 231 232 av_rdft_end(atempo->complex_to_real); 233 atempo->complex_to_real = NULL; 234} 235 236/* av_realloc is not aligned enough; fortunately, the data does not need to 237 * be preserved */ 238#define RE_MALLOC_OR_FAIL(field, field_size) \ 239 do { \ 240 av_freep(&field); \ 241 field = av_malloc(field_size); \ 242 if (!field) { \ 243 yae_release_buffers(atempo); \ 244 return AVERROR(ENOMEM); \ 245 } \ 246 } while (0) 247 248/** 249 * Prepare filter for processing audio data of given format, 250 * sample rate and number of channels. 251 */ 252static int yae_reset(ATempoContext *atempo, 253 enum AVSampleFormat format, 254 int sample_rate, 255 int channels) 256{ 257 const int sample_size = av_get_bytes_per_sample(format); 258 uint32_t nlevels = 0; 259 uint32_t pot; 260 int i; 261 262 atempo->format = format; 263 atempo->channels = channels; 264 atempo->stride = sample_size * channels; 265 266 // pick a segment window size: 267 atempo->window = sample_rate / 24; 268 269 // adjust window size to be a power-of-two integer: 270 nlevels = av_log2(atempo->window); 271 pot = 1 << nlevels; 272 av_assert0(pot <= atempo->window); 273 274 if (pot < atempo->window) { 275 atempo->window = pot * 2; 276 nlevels++; 277 } 278 279 // initialize audio fragment buffers: 280 RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride); 281 RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride); 282 RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex)); 283 RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex)); 284 285 // initialize rDFT contexts: 286 av_rdft_end(atempo->real_to_complex); 287 atempo->real_to_complex = NULL; 288 289 av_rdft_end(atempo->complex_to_real); 290 atempo->complex_to_real = NULL; 291 292 atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C); 293 if (!atempo->real_to_complex) { 294 yae_release_buffers(atempo); 295 return AVERROR(ENOMEM); 296 } 297 298 atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R); 299 if (!atempo->complex_to_real) { 300 yae_release_buffers(atempo); 301 return AVERROR(ENOMEM); 302 } 303 304 RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex)); 305 306 atempo->ring = atempo->window * 3; 307 RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride); 308 309 // initialize the Hann window function: 310 RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float)); 311 312 for (i = 0; i < atempo->window; i++) { 313 double t = (double)i / (double)(atempo->window - 1); 314 double h = 0.5 * (1.0 - cos(2.0 * M_PI * t)); 315 atempo->hann[i] = (float)h; 316 } 317 318 yae_clear(atempo); 319 return 0; 320} 321 322static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo) 323{ 324 const AudioFragment *prev; 325 ATempoContext *atempo = ctx->priv; 326 char *tail = NULL; 327 double tempo = av_strtod(arg_tempo, &tail); 328 329 if (tail && *tail) { 330 av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo); 331 return AVERROR(EINVAL); 332 } 333 334 if (tempo < 0.5 || tempo > 2.0) { 335 av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [0.5, 2.0] range\n", 336 tempo); 337 return AVERROR(EINVAL); 338 } 339 340 prev = yae_prev_frag(atempo); 341 atempo->origin[0] = prev->position[0] + atempo->window / 2; 342 atempo->origin[1] = prev->position[1] + atempo->window / 2; 343 atempo->tempo = tempo; 344 return 0; 345} 346 347/** 348 * A helper macro for initializing complex data buffer with scalar data 349 * of a given type. 350 */ 351#define yae_init_xdat(scalar_type, scalar_max) \ 352 do { \ 353 const uint8_t *src_end = src + \ 354 frag->nsamples * atempo->channels * sizeof(scalar_type); \ 355 \ 356 FFTSample *xdat = frag->xdat; \ 357 scalar_type tmp; \ 358 \ 359 if (atempo->channels == 1) { \ 360 for (; src < src_end; xdat++) { \ 361 tmp = *(const scalar_type *)src; \ 362 src += sizeof(scalar_type); \ 363 \ 364 *xdat = (FFTSample)tmp; \ 365 } \ 366 } else { \ 367 FFTSample s, max, ti, si; \ 368 int i; \ 369 \ 370 for (; src < src_end; xdat++) { \ 371 tmp = *(const scalar_type *)src; \ 372 src += sizeof(scalar_type); \ 373 \ 374 max = (FFTSample)tmp; \ 375 s = FFMIN((FFTSample)scalar_max, \ 376 (FFTSample)fabsf(max)); \ 377 \ 378 for (i = 1; i < atempo->channels; i++) { \ 379 tmp = *(const scalar_type *)src; \ 380 src += sizeof(scalar_type); \ 381 \ 382 ti = (FFTSample)tmp; \ 383 si = FFMIN((FFTSample)scalar_max, \ 384 (FFTSample)fabsf(ti)); \ 385 \ 386 if (s < si) { \ 387 s = si; \ 388 max = ti; \ 389 } \ 390 } \ 391 \ 392 *xdat = max; \ 393 } \ 394 } \ 395 } while (0) 396 397/** 398 * Initialize complex data buffer of a given audio fragment 399 * with down-mixed mono data of appropriate scalar type. 400 */ 401static void yae_downmix(ATempoContext *atempo, AudioFragment *frag) 402{ 403 // shortcuts: 404 const uint8_t *src = frag->data; 405 406 // init complex data buffer used for FFT and Correlation: 407 memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window); 408 409 if (atempo->format == AV_SAMPLE_FMT_U8) { 410 yae_init_xdat(uint8_t, 127); 411 } else if (atempo->format == AV_SAMPLE_FMT_S16) { 412 yae_init_xdat(int16_t, 32767); 413 } else if (atempo->format == AV_SAMPLE_FMT_S32) { 414 yae_init_xdat(int, 2147483647); 415 } else if (atempo->format == AV_SAMPLE_FMT_FLT) { 416 yae_init_xdat(float, 1); 417 } else if (atempo->format == AV_SAMPLE_FMT_DBL) { 418 yae_init_xdat(double, 1); 419 } 420} 421 422/** 423 * Populate the internal data buffer on as-needed basis. 424 * 425 * @return 426 * 0 if requested data was already available or was successfully loaded, 427 * AVERROR(EAGAIN) if more input data is required. 428 */ 429static int yae_load_data(ATempoContext *atempo, 430 const uint8_t **src_ref, 431 const uint8_t *src_end, 432 int64_t stop_here) 433{ 434 // shortcut: 435 const uint8_t *src = *src_ref; 436 const int read_size = stop_here - atempo->position[0]; 437 438 if (stop_here <= atempo->position[0]) { 439 return 0; 440 } 441 442 // samples are not expected to be skipped: 443 av_assert0(read_size <= atempo->ring); 444 445 while (atempo->position[0] < stop_here && src < src_end) { 446 int src_samples = (src_end - src) / atempo->stride; 447 448 // load data piece-wise, in order to avoid complicating the logic: 449 int nsamples = FFMIN(read_size, src_samples); 450 int na; 451 int nb; 452 453 nsamples = FFMIN(nsamples, atempo->ring); 454 na = FFMIN(nsamples, atempo->ring - atempo->tail); 455 nb = FFMIN(nsamples - na, atempo->ring); 456 457 if (na) { 458 uint8_t *a = atempo->buffer + atempo->tail * atempo->stride; 459 memcpy(a, src, na * atempo->stride); 460 461 src += na * atempo->stride; 462 atempo->position[0] += na; 463 464 atempo->size = FFMIN(atempo->size + na, atempo->ring); 465 atempo->tail = (atempo->tail + na) % atempo->ring; 466 atempo->head = 467 atempo->size < atempo->ring ? 468 atempo->tail - atempo->size : 469 atempo->tail; 470 } 471 472 if (nb) { 473 uint8_t *b = atempo->buffer; 474 memcpy(b, src, nb * atempo->stride); 475 476 src += nb * atempo->stride; 477 atempo->position[0] += nb; 478 479 atempo->size = FFMIN(atempo->size + nb, atempo->ring); 480 atempo->tail = (atempo->tail + nb) % atempo->ring; 481 atempo->head = 482 atempo->size < atempo->ring ? 483 atempo->tail - atempo->size : 484 atempo->tail; 485 } 486 } 487 488 // pass back the updated source buffer pointer: 489 *src_ref = src; 490 491 // sanity check: 492 av_assert0(atempo->position[0] <= stop_here); 493 494 return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN); 495} 496 497/** 498 * Populate current audio fragment data buffer. 499 * 500 * @return 501 * 0 when the fragment is ready, 502 * AVERROR(EAGAIN) if more input data is required. 503 */ 504static int yae_load_frag(ATempoContext *atempo, 505 const uint8_t **src_ref, 506 const uint8_t *src_end) 507{ 508 // shortcuts: 509 AudioFragment *frag = yae_curr_frag(atempo); 510 uint8_t *dst; 511 int64_t missing, start, zeros; 512 uint32_t nsamples; 513 const uint8_t *a, *b; 514 int i0, i1, n0, n1, na, nb; 515 516 int64_t stop_here = frag->position[0] + atempo->window; 517 if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) { 518 return AVERROR(EAGAIN); 519 } 520 521 // calculate the number of samples we don't have: 522 missing = 523 stop_here > atempo->position[0] ? 524 stop_here - atempo->position[0] : 0; 525 526 nsamples = 527 missing < (int64_t)atempo->window ? 528 (uint32_t)(atempo->window - missing) : 0; 529 530 // setup the output buffer: 531 frag->nsamples = nsamples; 532 dst = frag->data; 533 534 start = atempo->position[0] - atempo->size; 535 zeros = 0; 536 537 if (frag->position[0] < start) { 538 // what we don't have we substitute with zeros: 539 zeros = FFMIN(start - frag->position[0], (int64_t)nsamples); 540 av_assert0(zeros != nsamples); 541 542 memset(dst, 0, zeros * atempo->stride); 543 dst += zeros * atempo->stride; 544 } 545 546 if (zeros == nsamples) { 547 return 0; 548 } 549 550 // get the remaining data from the ring buffer: 551 na = (atempo->head < atempo->tail ? 552 atempo->tail - atempo->head : 553 atempo->ring - atempo->head); 554 555 nb = atempo->head < atempo->tail ? 0 : atempo->tail; 556 557 // sanity check: 558 av_assert0(nsamples <= zeros + na + nb); 559 560 a = atempo->buffer + atempo->head * atempo->stride; 561 b = atempo->buffer; 562 563 i0 = frag->position[0] + zeros - start; 564 i1 = i0 < na ? 0 : i0 - na; 565 566 n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0; 567 n1 = nsamples - zeros - n0; 568 569 if (n0) { 570 memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride); 571 dst += n0 * atempo->stride; 572 } 573 574 if (n1) { 575 memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride); 576 } 577 578 return 0; 579} 580 581/** 582 * Prepare for loading next audio fragment. 583 */ 584static void yae_advance_to_next_frag(ATempoContext *atempo) 585{ 586 const double fragment_step = atempo->tempo * (double)(atempo->window / 2); 587 588 const AudioFragment *prev; 589 AudioFragment *frag; 590 591 atempo->nfrag++; 592 prev = yae_prev_frag(atempo); 593 frag = yae_curr_frag(atempo); 594 595 frag->position[0] = prev->position[0] + (int64_t)fragment_step; 596 frag->position[1] = prev->position[1] + atempo->window / 2; 597 frag->nsamples = 0; 598} 599 600/** 601 * Calculate cross-correlation via rDFT. 602 * 603 * Multiply two vectors of complex numbers (result of real_to_complex rDFT) 604 * and transform back via complex_to_real rDFT. 605 */ 606static void yae_xcorr_via_rdft(FFTSample *xcorr, 607 RDFTContext *complex_to_real, 608 const FFTComplex *xa, 609 const FFTComplex *xb, 610 const int window) 611{ 612 FFTComplex *xc = (FFTComplex *)xcorr; 613 int i; 614 615 // NOTE: first element requires special care -- Given Y = rDFT(X), 616 // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc 617 // stores Re(Y[N/2]) in place of Im(Y[0]). 618 619 xc->re = xa->re * xb->re; 620 xc->im = xa->im * xb->im; 621 xa++; 622 xb++; 623 xc++; 624 625 for (i = 1; i < window; i++, xa++, xb++, xc++) { 626 xc->re = (xa->re * xb->re + xa->im * xb->im); 627 xc->im = (xa->im * xb->re - xa->re * xb->im); 628 } 629 630 // apply inverse rDFT: 631 av_rdft_calc(complex_to_real, xcorr); 632} 633 634/** 635 * Calculate alignment offset for given fragment 636 * relative to the previous fragment. 637 * 638 * @return alignment offset of current fragment relative to previous. 639 */ 640static int yae_align(AudioFragment *frag, 641 const AudioFragment *prev, 642 const int window, 643 const int delta_max, 644 const int drift, 645 FFTSample *correlation, 646 RDFTContext *complex_to_real) 647{ 648 int best_offset = -drift; 649 FFTSample best_metric = -FLT_MAX; 650 FFTSample *xcorr; 651 652 int i0; 653 int i1; 654 int i; 655 656 yae_xcorr_via_rdft(correlation, 657 complex_to_real, 658 (const FFTComplex *)prev->xdat, 659 (const FFTComplex *)frag->xdat, 660 window); 661 662 // identify search window boundaries: 663 i0 = FFMAX(window / 2 - delta_max - drift, 0); 664 i0 = FFMIN(i0, window); 665 666 i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16); 667 i1 = FFMAX(i1, 0); 668 669 // identify cross-correlation peaks within search window: 670 xcorr = correlation + i0; 671 672 for (i = i0; i < i1; i++, xcorr++) { 673 FFTSample metric = *xcorr; 674 675 // normalize: 676 FFTSample drifti = (FFTSample)(drift + i); 677 metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i); 678 679 if (metric > best_metric) { 680 best_metric = metric; 681 best_offset = i - window / 2; 682 } 683 } 684 685 return best_offset; 686} 687 688/** 689 * Adjust current fragment position for better alignment 690 * with previous fragment. 691 * 692 * @return alignment correction. 693 */ 694static int yae_adjust_position(ATempoContext *atempo) 695{ 696 const AudioFragment *prev = yae_prev_frag(atempo); 697 AudioFragment *frag = yae_curr_frag(atempo); 698 699 const double prev_output_position = 700 (double)(prev->position[1] - atempo->origin[1] + atempo->window / 2); 701 702 const double ideal_output_position = 703 (double)(prev->position[0] - atempo->origin[0] + atempo->window / 2) / 704 atempo->tempo; 705 706 const int drift = (int)(prev_output_position - ideal_output_position); 707 708 const int delta_max = atempo->window / 2; 709 const int correction = yae_align(frag, 710 prev, 711 atempo->window, 712 delta_max, 713 drift, 714 atempo->correlation, 715 atempo->complex_to_real); 716 717 if (correction) { 718 // adjust fragment position: 719 frag->position[0] -= correction; 720 721 // clear so that the fragment can be reloaded: 722 frag->nsamples = 0; 723 } 724 725 return correction; 726} 727 728/** 729 * A helper macro for blending the overlap region of previous 730 * and current audio fragment. 731 */ 732#define yae_blend(scalar_type) \ 733 do { \ 734 const scalar_type *aaa = (const scalar_type *)a; \ 735 const scalar_type *bbb = (const scalar_type *)b; \ 736 \ 737 scalar_type *out = (scalar_type *)dst; \ 738 scalar_type *out_end = (scalar_type *)dst_end; \ 739 int64_t i; \ 740 \ 741 for (i = 0; i < overlap && out < out_end; \ 742 i++, atempo->position[1]++, wa++, wb++) { \ 743 float w0 = *wa; \ 744 float w1 = *wb; \ 745 int j; \ 746 \ 747 for (j = 0; j < atempo->channels; \ 748 j++, aaa++, bbb++, out++) { \ 749 float t0 = (float)*aaa; \ 750 float t1 = (float)*bbb; \ 751 \ 752 *out = \ 753 frag->position[0] + i < 0 ? \ 754 *aaa : \ 755 (scalar_type)(t0 * w0 + t1 * w1); \ 756 } \ 757 } \ 758 dst = (uint8_t *)out; \ 759 } while (0) 760 761/** 762 * Blend the overlap region of previous and current audio fragment 763 * and output the results to the given destination buffer. 764 * 765 * @return 766 * 0 if the overlap region was completely stored in the dst buffer, 767 * AVERROR(EAGAIN) if more destination buffer space is required. 768 */ 769static int yae_overlap_add(ATempoContext *atempo, 770 uint8_t **dst_ref, 771 uint8_t *dst_end) 772{ 773 // shortcuts: 774 const AudioFragment *prev = yae_prev_frag(atempo); 775 const AudioFragment *frag = yae_curr_frag(atempo); 776 777 const int64_t start_here = FFMAX(atempo->position[1], 778 frag->position[1]); 779 780 const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples, 781 frag->position[1] + frag->nsamples); 782 783 const int64_t overlap = stop_here - start_here; 784 785 const int64_t ia = start_here - prev->position[1]; 786 const int64_t ib = start_here - frag->position[1]; 787 788 const float *wa = atempo->hann + ia; 789 const float *wb = atempo->hann + ib; 790 791 const uint8_t *a = prev->data + ia * atempo->stride; 792 const uint8_t *b = frag->data + ib * atempo->stride; 793 794 uint8_t *dst = *dst_ref; 795 796 av_assert0(start_here <= stop_here && 797 frag->position[1] <= start_here && 798 overlap <= frag->nsamples); 799 800 if (atempo->format == AV_SAMPLE_FMT_U8) { 801 yae_blend(uint8_t); 802 } else if (atempo->format == AV_SAMPLE_FMT_S16) { 803 yae_blend(int16_t); 804 } else if (atempo->format == AV_SAMPLE_FMT_S32) { 805 yae_blend(int); 806 } else if (atempo->format == AV_SAMPLE_FMT_FLT) { 807 yae_blend(float); 808 } else if (atempo->format == AV_SAMPLE_FMT_DBL) { 809 yae_blend(double); 810 } 811 812 // pass-back the updated destination buffer pointer: 813 *dst_ref = dst; 814 815 return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN); 816} 817 818/** 819 * Feed as much data to the filter as it is able to consume 820 * and receive as much processed data in the destination buffer 821 * as it is able to produce or store. 822 */ 823static void 824yae_apply(ATempoContext *atempo, 825 const uint8_t **src_ref, 826 const uint8_t *src_end, 827 uint8_t **dst_ref, 828 uint8_t *dst_end) 829{ 830 while (1) { 831 if (atempo->state == YAE_LOAD_FRAGMENT) { 832 // load additional data for the current fragment: 833 if (yae_load_frag(atempo, src_ref, src_end) != 0) { 834 break; 835 } 836 837 // down-mix to mono: 838 yae_downmix(atempo, yae_curr_frag(atempo)); 839 840 // apply rDFT: 841 av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat); 842 843 // must load the second fragment before alignment can start: 844 if (!atempo->nfrag) { 845 yae_advance_to_next_frag(atempo); 846 continue; 847 } 848 849 atempo->state = YAE_ADJUST_POSITION; 850 } 851 852 if (atempo->state == YAE_ADJUST_POSITION) { 853 // adjust position for better alignment: 854 if (yae_adjust_position(atempo)) { 855 // reload the fragment at the corrected position, so that the 856 // Hann window blending would not require normalization: 857 atempo->state = YAE_RELOAD_FRAGMENT; 858 } else { 859 atempo->state = YAE_OUTPUT_OVERLAP_ADD; 860 } 861 } 862 863 if (atempo->state == YAE_RELOAD_FRAGMENT) { 864 // load additional data if necessary due to position adjustment: 865 if (yae_load_frag(atempo, src_ref, src_end) != 0) { 866 break; 867 } 868 869 // down-mix to mono: 870 yae_downmix(atempo, yae_curr_frag(atempo)); 871 872 // apply rDFT: 873 av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat); 874 875 atempo->state = YAE_OUTPUT_OVERLAP_ADD; 876 } 877 878 if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) { 879 // overlap-add and output the result: 880 if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) { 881 break; 882 } 883 884 // advance to the next fragment, repeat: 885 yae_advance_to_next_frag(atempo); 886 atempo->state = YAE_LOAD_FRAGMENT; 887 } 888 } 889} 890 891/** 892 * Flush any buffered data from the filter. 893 * 894 * @return 895 * 0 if all data was completely stored in the dst buffer, 896 * AVERROR(EAGAIN) if more destination buffer space is required. 897 */ 898static int yae_flush(ATempoContext *atempo, 899 uint8_t **dst_ref, 900 uint8_t *dst_end) 901{ 902 AudioFragment *frag = yae_curr_frag(atempo); 903 int64_t overlap_end; 904 int64_t start_here; 905 int64_t stop_here; 906 int64_t offset; 907 908 const uint8_t *src; 909 uint8_t *dst; 910 911 int src_size; 912 int dst_size; 913 int nbytes; 914 915 atempo->state = YAE_FLUSH_OUTPUT; 916 917 if (atempo->position[0] == frag->position[0] + frag->nsamples && 918 atempo->position[1] == frag->position[1] + frag->nsamples) { 919 // the current fragment is already flushed: 920 return 0; 921 } 922 923 if (frag->position[0] + frag->nsamples < atempo->position[0]) { 924 // finish loading the current (possibly partial) fragment: 925 yae_load_frag(atempo, NULL, NULL); 926 927 if (atempo->nfrag) { 928 // down-mix to mono: 929 yae_downmix(atempo, frag); 930 931 // apply rDFT: 932 av_rdft_calc(atempo->real_to_complex, frag->xdat); 933 934 // align current fragment to previous fragment: 935 if (yae_adjust_position(atempo)) { 936 // reload the current fragment due to adjusted position: 937 yae_load_frag(atempo, NULL, NULL); 938 } 939 } 940 } 941 942 // flush the overlap region: 943 overlap_end = frag->position[1] + FFMIN(atempo->window / 2, 944 frag->nsamples); 945 946 while (atempo->position[1] < overlap_end) { 947 if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) { 948 return AVERROR(EAGAIN); 949 } 950 } 951 952 // flush the remaininder of the current fragment: 953 start_here = FFMAX(atempo->position[1], overlap_end); 954 stop_here = frag->position[1] + frag->nsamples; 955 offset = start_here - frag->position[1]; 956 av_assert0(start_here <= stop_here && frag->position[1] <= start_here); 957 958 src = frag->data + offset * atempo->stride; 959 dst = (uint8_t *)*dst_ref; 960 961 src_size = (int)(stop_here - start_here) * atempo->stride; 962 dst_size = dst_end - dst; 963 nbytes = FFMIN(src_size, dst_size); 964 965 memcpy(dst, src, nbytes); 966 dst += nbytes; 967 968 atempo->position[1] += (nbytes / atempo->stride); 969 970 // pass-back the updated destination buffer pointer: 971 *dst_ref = (uint8_t *)dst; 972 973 return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN); 974} 975 976static av_cold int init(AVFilterContext *ctx) 977{ 978 ATempoContext *atempo = ctx->priv; 979 atempo->format = AV_SAMPLE_FMT_NONE; 980 atempo->state = YAE_LOAD_FRAGMENT; 981 return 0; 982} 983 984static av_cold void uninit(AVFilterContext *ctx) 985{ 986 ATempoContext *atempo = ctx->priv; 987 yae_release_buffers(atempo); 988} 989 990static int query_formats(AVFilterContext *ctx) 991{ 992 AVFilterChannelLayouts *layouts = NULL; 993 AVFilterFormats *formats = NULL; 994 995 // WSOLA necessitates an internal sliding window ring buffer 996 // for incoming audio stream. 997 // 998 // Planar sample formats are too cumbersome to store in a ring buffer, 999 // therefore planar sample formats are not supported. 1000 // 1001 static const enum AVSampleFormat sample_fmts[] = { 1002 AV_SAMPLE_FMT_U8, 1003 AV_SAMPLE_FMT_S16, 1004 AV_SAMPLE_FMT_S32, 1005 AV_SAMPLE_FMT_FLT, 1006 AV_SAMPLE_FMT_DBL, 1007 AV_SAMPLE_FMT_NONE 1008 }; 1009 1010 layouts = ff_all_channel_layouts(); 1011 if (!layouts) { 1012 return AVERROR(ENOMEM); 1013 } 1014 ff_set_common_channel_layouts(ctx, layouts); 1015 1016 formats = ff_make_format_list(sample_fmts); 1017 if (!formats) { 1018 return AVERROR(ENOMEM); 1019 } 1020 ff_set_common_formats(ctx, formats); 1021 1022 formats = ff_all_samplerates(); 1023 if (!formats) { 1024 return AVERROR(ENOMEM); 1025 } 1026 ff_set_common_samplerates(ctx, formats); 1027 1028 return 0; 1029} 1030 1031static int config_props(AVFilterLink *inlink) 1032{ 1033 AVFilterContext *ctx = inlink->dst; 1034 ATempoContext *atempo = ctx->priv; 1035 1036 enum AVSampleFormat format = inlink->format; 1037 int sample_rate = (int)inlink->sample_rate; 1038 int channels = av_get_channel_layout_nb_channels(inlink->channel_layout); 1039 1040 ctx->outputs[0]->flags |= FF_LINK_FLAG_REQUEST_LOOP; 1041 1042 return yae_reset(atempo, format, sample_rate, channels); 1043} 1044 1045static int push_samples(ATempoContext *atempo, 1046 AVFilterLink *outlink, 1047 int n_out) 1048{ 1049 int ret; 1050 1051 atempo->dst_buffer->sample_rate = outlink->sample_rate; 1052 atempo->dst_buffer->nb_samples = n_out; 1053 1054 // adjust the PTS: 1055 atempo->dst_buffer->pts = 1056 av_rescale_q(atempo->nsamples_out, 1057 (AVRational){ 1, outlink->sample_rate }, 1058 outlink->time_base); 1059 1060 ret = ff_filter_frame(outlink, atempo->dst_buffer); 1061 atempo->dst_buffer = NULL; 1062 atempo->dst = NULL; 1063 atempo->dst_end = NULL; 1064 if (ret < 0) 1065 return ret; 1066 1067 atempo->nsamples_out += n_out; 1068 return 0; 1069} 1070 1071static int filter_frame(AVFilterLink *inlink, AVFrame *src_buffer) 1072{ 1073 AVFilterContext *ctx = inlink->dst; 1074 ATempoContext *atempo = ctx->priv; 1075 AVFilterLink *outlink = ctx->outputs[0]; 1076 1077 int ret = 0; 1078 int n_in = src_buffer->nb_samples; 1079 int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo); 1080 1081 const uint8_t *src = src_buffer->data[0]; 1082 const uint8_t *src_end = src + n_in * atempo->stride; 1083 1084 while (src < src_end) { 1085 if (!atempo->dst_buffer) { 1086 atempo->dst_buffer = ff_get_audio_buffer(outlink, n_out); 1087 if (!atempo->dst_buffer) 1088 return AVERROR(ENOMEM); 1089 av_frame_copy_props(atempo->dst_buffer, src_buffer); 1090 1091 atempo->dst = atempo->dst_buffer->data[0]; 1092 atempo->dst_end = atempo->dst + n_out * atempo->stride; 1093 } 1094 1095 yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end); 1096 1097 if (atempo->dst == atempo->dst_end) { 1098 int n_samples = ((atempo->dst - atempo->dst_buffer->data[0]) / 1099 atempo->stride); 1100 ret = push_samples(atempo, outlink, n_samples); 1101 if (ret < 0) 1102 goto end; 1103 } 1104 } 1105 1106 atempo->nsamples_in += n_in; 1107end: 1108 av_frame_free(&src_buffer); 1109 return ret; 1110} 1111 1112static int request_frame(AVFilterLink *outlink) 1113{ 1114 AVFilterContext *ctx = outlink->src; 1115 ATempoContext *atempo = ctx->priv; 1116 int ret; 1117 1118 ret = ff_request_frame(ctx->inputs[0]); 1119 1120 if (ret == AVERROR_EOF) { 1121 // flush the filter: 1122 int n_max = atempo->ring; 1123 int n_out; 1124 int err = AVERROR(EAGAIN); 1125 1126 while (err == AVERROR(EAGAIN)) { 1127 if (!atempo->dst_buffer) { 1128 atempo->dst_buffer = ff_get_audio_buffer(outlink, n_max); 1129 if (!atempo->dst_buffer) 1130 return AVERROR(ENOMEM); 1131 1132 atempo->dst = atempo->dst_buffer->data[0]; 1133 atempo->dst_end = atempo->dst + n_max * atempo->stride; 1134 } 1135 1136 err = yae_flush(atempo, &atempo->dst, atempo->dst_end); 1137 1138 n_out = ((atempo->dst - atempo->dst_buffer->data[0]) / 1139 atempo->stride); 1140 1141 if (n_out) { 1142 ret = push_samples(atempo, outlink, n_out); 1143 } 1144 } 1145 1146 av_frame_free(&atempo->dst_buffer); 1147 atempo->dst = NULL; 1148 atempo->dst_end = NULL; 1149 1150 return AVERROR_EOF; 1151 } 1152 1153 return ret; 1154} 1155 1156static int process_command(AVFilterContext *ctx, 1157 const char *cmd, 1158 const char *arg, 1159 char *res, 1160 int res_len, 1161 int flags) 1162{ 1163 return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS); 1164} 1165 1166static const AVFilterPad atempo_inputs[] = { 1167 { 1168 .name = "default", 1169 .type = AVMEDIA_TYPE_AUDIO, 1170 .filter_frame = filter_frame, 1171 .config_props = config_props, 1172 }, 1173 { NULL } 1174}; 1175 1176static const AVFilterPad atempo_outputs[] = { 1177 { 1178 .name = "default", 1179 .request_frame = request_frame, 1180 .type = AVMEDIA_TYPE_AUDIO, 1181 }, 1182 { NULL } 1183}; 1184 1185AVFilter ff_af_atempo = { 1186 .name = "atempo", 1187 .description = NULL_IF_CONFIG_SMALL("Adjust audio tempo."), 1188 .init = init, 1189 .uninit = uninit, 1190 .query_formats = query_formats, 1191 .process_command = process_command, 1192 .priv_size = sizeof(ATempoContext), 1193 .priv_class = &atempo_class, 1194 .inputs = atempo_inputs, 1195 .outputs = atempo_outputs, 1196}; 1197