1/******************************************************************** 2 * * 3 * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. * 4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * 5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * 6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * 7 * * 8 * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001 * 9 * by the Xiph.Org Foundation http://www.xiph.org/ * 10 * * 11 ******************************************************************** 12 13 function: utility for paring low hit count cells from lattice codebook 14 last mod: $Id: latticepare.c 16037 2009-05-26 21:10:58Z xiphmont $ 15 16 ********************************************************************/ 17 18#include <stdlib.h> 19#include <stdio.h> 20#include <math.h> 21#include <string.h> 22#include <errno.h> 23#include "../lib/scales.h" 24#include "bookutil.h" 25#include "vqgen.h" 26#include "vqsplit.h" 27#include "../lib/os.h" 28 29/* Lattice codebooks have two strengths: important fetaures that are 30 poorly modelled by global error minimization training (eg, strong 31 peaks) are not neglected 2) compact quantized representation. 32 33 A fully populated lattice codebook, however, swings point 1 too far 34 in the opposite direction; rare features need not be modelled quite 35 so religiously and as such, we waste bits unless we eliminate the 36 least common cells. The codebook rep supports unused cells, so we 37 need to tag such cells and build an auxiliary (non-thresh) search 38 mechanism to find the proper match quickly */ 39 40/* two basic steps; first is pare the cell for which dispersal creates 41 the least additional error. This will naturally choose 42 low-population cells and cells that have not taken on points from 43 neighboring paring (but does not result in the lattice collapsing 44 inward and leaving low population ares totally unmodelled). After 45 paring has removed the desired number of cells, we need to build an 46 auxiliary search for each culled point */ 47 48/* Although lattice books (due to threshhold-based matching) do not 49 actually use error to make cell selections (in fact, it need not 50 bear any relation), the 'secondbest' entry finder here is in fact 51 error/distance based, so latticepare is only useful on such books */ 52 53/* command line: 54 latticepare latticebook.vqh input_data.vqd <target_cells> 55 56 produces a new output book on stdout 57*/ 58 59static float _dist(int el,float *a, float *b){ 60 int i; 61 float acc=0.f; 62 for(i=0;i<el;i++){ 63 float val=(a[i]-b[i]); 64 acc+=val*val; 65 } 66 return(acc); 67} 68 69static float *pointlist; 70static long points=0; 71 72void add_vector(codebook *b,float *vec,long n){ 73 int dim=b->dim,i,j; 74 int step=n/dim; 75 for(i=0;i<step;i++){ 76 for(j=i;j<n;j+=step){ 77 pointlist[points++]=vec[j]; 78 } 79 } 80} 81 82static int bestm(codebook *b,float *vec){ 83 encode_aux_threshmatch *tt=b->c->thresh_tree; 84 int dim=b->dim; 85 int i,k,o; 86 int best=0; 87 88 /* what would be the closest match if the codebook was fully 89 populated? */ 90 91 for(k=0,o=dim-1;k<dim;k++,o--){ 92 int i; 93 for(i=0;i<tt->threshvals-1;i++) 94 if(vec[o]<tt->quantthresh[i])break; 95 best=(best*tt->quantvals)+tt->quantmap[i]; 96 } 97 return(best); 98} 99 100static int closest(codebook *b,float *vec,int current){ 101 encode_aux_threshmatch *tt=b->c->thresh_tree; 102 int dim=b->dim; 103 int i,k,o; 104 105 float bestmetric=0; 106 int bestentry=-1; 107 int best=bestm(b,vec); 108 109 if(current<0 && b->c->lengthlist[best]>0)return best; 110 111 for(i=0;i<b->entries;i++){ 112 if(b->c->lengthlist[i]>0 && i!=best && i!=current){ 113 float thismetric=_dist(dim, vec, b->valuelist+i*dim); 114 if(bestentry==-1 || thismetric<bestmetric){ 115 bestentry=i; 116 bestmetric=thismetric; 117 } 118 } 119 } 120 121 return(bestentry); 122} 123 124static float _heuristic(codebook *b,float *ppt,int secondbest){ 125 float *secondcell=b->valuelist+secondbest*b->dim; 126 int best=bestm(b,ppt); 127 float *firstcell=b->valuelist+best*b->dim; 128 float error=_dist(b->dim,firstcell,secondcell); 129 float *zero=alloca(b->dim*sizeof(float)); 130 float fromzero; 131 132 memset(zero,0,b->dim*sizeof(float)); 133 fromzero=sqrt(_dist(b->dim,firstcell,zero)); 134 135 return(error/fromzero); 136} 137 138static int longsort(const void *a, const void *b){ 139 return **(long **)b-**(long **)a; 140} 141 142void usage(void){ 143 fprintf(stderr,"Ogg/Vorbis lattice codebook paring utility\n\n" 144 "usage: latticepare book.vqh data.vqd <target_cells> <protected_cells> base\n" 145 "where <target_cells> is the desired number of final cells (or -1\n" 146 " for no change)\n" 147 " <protected_cells> is the number of highest-hit count cells\n" 148 " to protect from dispersal\n" 149 " base is the base name (not including .vqh) of the new\n" 150 " book\n\n"); 151 exit(1); 152} 153 154int main(int argc,char *argv[]){ 155 char *basename; 156 codebook *b=NULL; 157 int entries=0; 158 int dim=0; 159 long i,j,target=-1,protect=-1; 160 FILE *out=NULL; 161 162 int argnum=0; 163 164 argv++; 165 if(*argv==NULL){ 166 usage(); 167 exit(1); 168 } 169 170 while(*argv){ 171 if(*argv[0]=='-'){ 172 173 argv++; 174 175 }else{ 176 switch (argnum++){ 177 case 0:case 1: 178 { 179 /* yes, this is evil. However, it's very convenient to parse file 180 extentions */ 181 182 /* input file. What kind? */ 183 char *dot; 184 char *ext=NULL; 185 char *name=strdup(*argv++); 186 dot=strrchr(name,'.'); 187 if(dot) 188 ext=dot+1; 189 else{ 190 ext=""; 191 192 } 193 194 195 /* codebook */ 196 if(!strcmp(ext,"vqh")){ 197 198 basename=strrchr(name,'/'); 199 if(basename) 200 basename=strdup(basename)+1; 201 else 202 basename=strdup(name); 203 dot=strrchr(basename,'.'); 204 if(dot)*dot='\0'; 205 206 b=codebook_load(name); 207 dim=b->dim; 208 entries=b->entries; 209 } 210 211 /* data file; we do actually need to suck it into memory */ 212 /* we're dealing with just one book, so we can de-interleave */ 213 if(!strcmp(ext,"vqd") && !points){ 214 int cols; 215 long lines=0; 216 char *line; 217 float *vec; 218 FILE *in=fopen(name,"r"); 219 if(!in){ 220 fprintf(stderr,"Could not open input file %s\n",name); 221 exit(1); 222 } 223 224 reset_next_value(); 225 line=setup_line(in); 226 /* count cols before we start reading */ 227 { 228 char *temp=line; 229 while(*temp==' ')temp++; 230 for(cols=0;*temp;cols++){ 231 while(*temp>32)temp++; 232 while(*temp==' ')temp++; 233 } 234 } 235 vec=alloca(cols*sizeof(float)); 236 /* count, then load, to avoid fragmenting the hell out of 237 memory */ 238 while(line){ 239 lines++; 240 for(j=0;j<cols;j++) 241 if(get_line_value(in,vec+j)){ 242 fprintf(stderr,"Too few columns on line %ld in data file\n",lines); 243 exit(1); 244 } 245 if((lines&0xff)==0)spinnit("counting samples...",lines*cols); 246 line=setup_line(in); 247 } 248 pointlist=_ogg_malloc((cols*lines+entries*dim)*sizeof(float)); 249 250 rewind(in); 251 line=setup_line(in); 252 while(line){ 253 lines--; 254 for(j=0;j<cols;j++) 255 if(get_line_value(in,vec+j)){ 256 fprintf(stderr,"Too few columns on line %ld in data file\n",lines); 257 exit(1); 258 } 259 /* deinterleave, add to heap */ 260 add_vector(b,vec,cols); 261 if((lines&0xff)==0)spinnit("loading samples...",lines*cols); 262 263 line=setup_line(in); 264 } 265 fclose(in); 266 } 267 } 268 break; 269 case 2: 270 target=atol(*argv++); 271 if(target==0)target=entries; 272 break; 273 case 3: 274 protect=atol(*argv++); 275 break; 276 case 4: 277 { 278 char *buff=alloca(strlen(*argv)+5); 279 sprintf(buff,"%s.vqh",*argv); 280 basename=*argv++; 281 282 out=fopen(buff,"w"); 283 if(!out){ 284 fprintf(stderr,"unable ot open %s for output",buff); 285 exit(1); 286 } 287 } 288 break; 289 default: 290 usage(); 291 } 292 } 293 } 294 if(!entries || !points || !out)usage(); 295 if(target==-1)usage(); 296 297 /* add guard points */ 298 for(i=0;i<entries;i++) 299 for(j=0;j<dim;j++) 300 pointlist[points++]=b->valuelist[i*dim+j]; 301 302 points/=dim; 303 304 /* set up auxiliary vectors for error tracking */ 305 { 306 encode_aux_nearestmatch *nt=NULL; 307 long pointssofar=0; 308 long *pointindex; 309 long indexedpoints=0; 310 long *entryindex; 311 long *reventry; 312 long *membership=_ogg_malloc(points*sizeof(long)); 313 long *firsthead=_ogg_malloc(entries*sizeof(long)); 314 long *secondary=_ogg_malloc(points*sizeof(long)); 315 long *secondhead=_ogg_malloc(entries*sizeof(long)); 316 317 long *cellcount=_ogg_calloc(entries,sizeof(long)); 318 long *cellcount2=_ogg_calloc(entries,sizeof(long)); 319 float *cellerror=_ogg_calloc(entries,sizeof(float)); 320 float *cellerrormax=_ogg_calloc(entries,sizeof(float)); 321 long cellsleft=entries; 322 for(i=0;i<points;i++)membership[i]=-1; 323 for(i=0;i<entries;i++)firsthead[i]=-1; 324 for(i=0;i<points;i++)secondary[i]=-1; 325 for(i=0;i<entries;i++)secondhead[i]=-1; 326 327 for(i=0;i<points;i++){ 328 /* assign vectors to the nearest cell. Also keep track of second 329 nearest for error statistics */ 330 float *ppt=pointlist+i*dim; 331 int firstentry=closest(b,ppt,-1); 332 int secondentry=closest(b,ppt,firstentry); 333 float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt); 334 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt); 335 336 if(!(i&0xff))spinnit("initializing... ",points-i); 337 338 membership[i]=firsthead[firstentry]; 339 firsthead[firstentry]=i; 340 secondary[i]=secondhead[secondentry]; 341 secondhead[secondentry]=i; 342 343 if(i<points-entries){ 344 cellerror[firstentry]+=secondmetric-firstmetric; 345 cellerrormax[firstentry]=max(cellerrormax[firstentry], 346 _heuristic(b,ppt,secondentry)); 347 cellcount[firstentry]++; 348 cellcount2[secondentry]++; 349 } 350 } 351 352 /* which cells are most heavily populated? Protect as many from 353 dispersal as the user has requested */ 354 { 355 long **countindex=_ogg_calloc(entries,sizeof(long *)); 356 for(i=0;i<entries;i++)countindex[i]=cellcount+i; 357 qsort(countindex,entries,sizeof(long *),longsort); 358 for(i=0;i<protect;i++){ 359 int ptr=countindex[i]-cellcount; 360 cellerrormax[ptr]=9e50f; 361 } 362 } 363 364 { 365 fprintf(stderr,"\r"); 366 for(i=0;i<entries;i++){ 367 /* decompose index */ 368 int entry=i; 369 for(j=0;j<dim;j++){ 370 fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals); 371 entry/=b->c->thresh_tree->quantvals; 372 } 373 374 fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]); 375 } 376 fprintf(stderr,"\n"); 377 } 378 379 /* do the automatic cull request */ 380 while(cellsleft>target){ 381 int bestcell=-1; 382 float besterror=0; 383 float besterror2=0; 384 long head=-1; 385 char spinbuf[80]; 386 sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target); 387 388 /* find the cell with lowest removal impact */ 389 for(i=0;i<entries;i++){ 390 if(b->c->lengthlist[i]>0){ 391 if(bestcell==-1 || cellerrormax[i]<=besterror2){ 392 if(bestcell==-1 || cellerrormax[i]<besterror2 || 393 besterror>cellerror[i]){ 394 besterror=cellerror[i]; 395 besterror2=cellerrormax[i]; 396 bestcell=i; 397 } 398 } 399 } 400 } 401 402 fprintf(stderr,"\reliminating cell %d \n" 403 " dispersal error of %g max/%g total (%ld hits)\n", 404 bestcell,besterror2,besterror,cellcount[bestcell]); 405 406 /* disperse it. move each point out, adding it (properly) to 407 the second best */ 408 b->c->lengthlist[bestcell]=0; 409 head=firsthead[bestcell]; 410 firsthead[bestcell]=-1; 411 while(head!=-1){ 412 /* head is a point number */ 413 float *ppt=pointlist+head*dim; 414 int firstentry=closest(b,ppt,-1); 415 int secondentry=closest(b,ppt,firstentry); 416 float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt); 417 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt); 418 long next=membership[head]; 419 420 if(head<points-entries){ 421 cellcount[firstentry]++; 422 cellcount[bestcell]--; 423 cellerror[firstentry]+=secondmetric-firstmetric; 424 cellerrormax[firstentry]=max(cellerrormax[firstentry], 425 _heuristic(b,ppt,secondentry)); 426 } 427 428 membership[head]=firsthead[firstentry]; 429 firsthead[firstentry]=head; 430 head=next; 431 if(cellcount[bestcell]%128==0) 432 spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]); 433 434 } 435 436 /* now see that all points that had the dispersed cell as second 437 choice have second choice reassigned */ 438 head=secondhead[bestcell]; 439 secondhead[bestcell]=-1; 440 while(head!=-1){ 441 float *ppt=pointlist+head*dim; 442 /* who are we assigned to now? */ 443 int firstentry=closest(b,ppt,-1); 444 /* what is the new second closest match? */ 445 int secondentry=closest(b,ppt,firstentry); 446 /* old second closest is the cell being disbanded */ 447 float oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt); 448 /* new second closest error */ 449 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt); 450 long next=secondary[head]; 451 452 if(head<points-entries){ 453 cellcount2[secondentry]++; 454 cellcount2[bestcell]--; 455 cellerror[firstentry]+=secondmetric-oldsecondmetric; 456 cellerrormax[firstentry]=max(cellerrormax[firstentry], 457 _heuristic(b,ppt,secondentry)); 458 } 459 460 secondary[head]=secondhead[secondentry]; 461 secondhead[secondentry]=head; 462 head=next; 463 464 if(cellcount2[bestcell]%128==0) 465 spinnit(spinbuf,cellcount2[bestcell]); 466 } 467 468 cellsleft--; 469 } 470 471 /* paring is over. Build decision trees using points that now fall 472 through the thresh matcher. */ 473 /* we don't free membership; we flatten it in order to use in lp_split */ 474 475 for(i=0;i<entries;i++){ 476 long head=firsthead[i]; 477 spinnit("rearranging membership cache... ",entries-i); 478 while(head!=-1){ 479 long next=membership[head]; 480 membership[head]=i; 481 head=next; 482 } 483 } 484 485 free(secondhead); 486 free(firsthead); 487 free(cellerror); 488 free(cellerrormax); 489 free(secondary); 490 491 pointindex=_ogg_malloc(points*sizeof(long)); 492 /* make a point index of fall-through points */ 493 for(i=0;i<points;i++){ 494 int best=_best(b,pointlist+i*dim,1); 495 if(best==-1) 496 pointindex[indexedpoints++]=i; 497 spinnit("finding orphaned points... ",points-i); 498 } 499 500 /* make an entry index */ 501 entryindex=_ogg_malloc(entries*sizeof(long)); 502 target=0; 503 for(i=0;i<entries;i++){ 504 if(b->c->lengthlist[i]>0) 505 entryindex[target++]=i; 506 } 507 508 /* make working space for a reverse entry index */ 509 reventry=_ogg_malloc(entries*sizeof(long)); 510 511 /* do the split */ 512 nt=b->c->nearest_tree= 513 _ogg_calloc(1,sizeof(encode_aux_nearestmatch)); 514 515 nt->alloc=4096; 516 nt->ptr0=_ogg_malloc(sizeof(long)*nt->alloc); 517 nt->ptr1=_ogg_malloc(sizeof(long)*nt->alloc); 518 nt->p=_ogg_malloc(sizeof(long)*nt->alloc); 519 nt->q=_ogg_malloc(sizeof(long)*nt->alloc); 520 nt->aux=0; 521 522 fprintf(stderr,"Leaves added: %d \n", 523 lp_split(pointlist,points, 524 b,entryindex,target, 525 pointindex,indexedpoints, 526 membership,reventry, 527 0,&pointssofar)); 528 free(membership); 529 free(reventry); 530 free(pointindex); 531 532 /* hack alert. I should just change the damned splitter and 533 codebook writer */ 534 for(i=0;i<nt->aux;i++)nt->p[i]*=dim; 535 for(i=0;i<nt->aux;i++)nt->q[i]*=dim; 536 537 /* recount hits. Build new lengthlist. reuse entryindex storage */ 538 for(i=0;i<entries;i++)entryindex[i]=1; 539 for(i=0;i<points-entries;i++){ 540 int best=_best(b,pointlist+i*dim,1); 541 float *a=pointlist+i*dim; 542 if(!(i&0xff))spinnit("counting hits...",i); 543 if(best==-1){ 544 fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n" 545 "codebook entry. The new decision tree is broken.\n"); 546 exit(1); 547 } 548 entryindex[best]++; 549 } 550 for(i=0;i<nt->aux;i++)nt->p[i]/=dim; 551 for(i=0;i<nt->aux;i++)nt->q[i]/=dim; 552 553 /* the lengthlist builder doesn't actually deal with 0 hit entries. 554 So, we pack the 'sparse' hit list into a dense list, then unpack 555 the lengths after the build */ 556 { 557 int upper=0; 558 long *lengthlist=_ogg_calloc(entries,sizeof(long)); 559 for(i=0;i<entries;i++){ 560 if(b->c->lengthlist[i]>0) 561 entryindex[upper++]=entryindex[i]; 562 else{ 563 if(entryindex[i]>1){ 564 fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n"); 565 exit(1); 566 } 567 } 568 } 569 570 /* sanity check */ 571 if(upper != target){ 572 fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n"); 573 exit(1); 574 } 575 576 build_tree_from_lengths(upper,entryindex,lengthlist); 577 578 upper=0; 579 for(i=0;i<entries;i++){ 580 if(b->c->lengthlist[i]>0) 581 b->c->lengthlist[i]=lengthlist[upper++]; 582 } 583 584 } 585 } 586 /* we're done. write it out. */ 587 write_codebook(out,basename,b->c); 588 589 fprintf(stderr,"\r \nDone.\n"); 590 return(0); 591} 592 593 594 595 596