1/*
2 * adjustlogs.c:  Program to take multiple alog logfiles, extract events
3 *                for synchronizing the clocks, and generating adjusted times.
4 *                The files are replaced, allowing the use of other alog tools.
5 *
6 * -e n defines synchronization events
7 * -a1 n -a2 m -b1 k define pair-exchange events used to compute clock offsets
8 * (There are predefined values; these allow the user to define their own)
9 *
10 * Algorithm:
11 *     Build a matrix of time events; solve it for the offset and skew for
12 *     each clock.  For the first pass, this "matrix" will have just the
13 *     "synchronization" events.
14 *
15 * This is the formula:
16 * Processor 0 has the standard clock.
17 * At the end of each sync, the clock are re-synchronized.
18 * Thus, the global time for processor p is
19 *    Find the interval I in synctime that contains the local time
20 *    The adjusted gtime is:
21 *
22 *            stime[0][I+1]-stime[0][I]
23 *    gtime = ------------------------- (time - stime[p][I]) + stime[0][I]
24 *            stime[p][I+1]-stime[p][I]
25 *
26 * The current implementation uses a single interval.
27 *
28 * Just to keep things more interesting, the timer is really a 64 bit clock,
29 * with the field "time_slot" containing the high bits.
30 *
31 */
32#include <stdio.h>
33#include <ctype.h>           /* for calling isdigit()             */
34#include "alog_evntdfs.h"       /* Logfile definitions */
35
36#define FALSE 0
37#define TRUE !FALSE
38#define MAX(x,y)	( (x) > (y) ? (x) : (y) )
39
40#define C_DATA_LEN 50
41
42#define DO_NEGATIVE 1
43#define IGNORE_NEGATIVE 2
44
45struct log_entry
46{
47	int proc_id;
48	int task_id;
49	int event;
50	int i_data;
51	char c_data[C_DATA_LEN];
52	int time_slot;
53	unsigned long time;
54};
55
56#define MAX_NSYNC 2
57typedef struct {
58    unsigned long *time;            /* time values that were recorded */
59    } SyncTime;
60SyncTime synctime[MAX_NSYNC];
61
62/* For now, we just handle a set of timing events (np-1 of them)
63   between processor i and i+1 (processor 0 participates in only
64   1 event) */
65typedef struct {
66    unsigned long a1, b1, a2;         /* Times for the events */
67    int           p0, p1;             /* processors that participated in
68					 this time-exchange */
69    } OffsetEvents;
70OffsetEvents *offsetevents;
71int noffsetevents = 0;
72
73/* A local offset is used to compensate for the time_slot (upper 32 bits) */
74/* NOT YET IMPLEMENTED */
75long local_offset;
76
77/* The global time is found by adding an offset and scaling by
78   a fraction that is represented by numer[i]/denom[i] on the i'th
79   processor */
80unsigned long *numer;
81unsigned long *denom;
82long          *globaloffset;
83
84/* mintime holds the mintime for ALL runs; this can be used to
85   offset the values */
86unsigned long mintime;
87
88/* These hold user-defined synchronization events */
89#define MAX_USERETYPES 100
90static int syncevent[MAX_USERETYPES];
91static int syncep=0;
92
93/* These hold the 3 event types used to adjust the individual offsets
94   (if not present, the synchronization events are used to compute the
95   offsets)
96 */
97static int a1event[MAX_USERETYPES],
98           a2event[MAX_USERETYPES],
99           b1event[MAX_USERETYPES];
100static int a1p = 0, a2p = 0, b1p = 0;
101
102static unsigned long lowmask = 0xFFFF;
103
104void ComputeOffsets();
105
106main(argc,argv)
107int argc;
108char *argv[];
109{
110	FILE *headerfp, *fd;
111	int  np, i, nsync, nlsync;
112	char headerfile[255];
113	int pid;
114        int firstfile;
115
116	if ( argc <= 1 )
117		usage( argv[0] );
118
119	/* Look for user-defined events */
120	for (i=1; i<argc; i++) {
121	    if (strcmp(argv[i],"-e") == 0)
122		/* Test on MAX_USERTYPES */
123		syncevent[syncep++] = atoi(argv[++i]);
124	    else if (strcmp(argv[i],"-a1") == 0)
125	    	a1event[a1p++] = atoi(argv[++i]);
126	    else if (strcmp(argv[i],"-a2") == 0)
127	    	a2event[a2p++] = atoi(argv[++i]);
128	    else if (strcmp(argv[i],"-b1") == 0)
129	    	b1event[b1p++] = atoi(argv[++i]);
130	    else
131		break;
132	    }
133	/* Figure out how many processors there are */
134	np        = argc - i;
135	firstfile = i;
136	/* These could be allocated on demand */
137	for (i=0; i<MAX_NSYNC; i++) {
138	    synctime[i].time       = (unsigned long *)
139		                malloc( np * sizeof(unsigned long) );
140	    }
141
142	globaloffset = (long *) malloc( np * sizeof(long) );
143	numer        = (unsigned long *) malloc( np * sizeof(unsigned long) );
144	denom        = (unsigned long *) malloc( np * sizeof(unsigned long) );
145	offsetevents = (OffsetEvents *) malloc( np * sizeof(OffsetEvents) );
146	mintime      = (unsigned long)(~0);
147
148	/* Loop through each file, looking for the synchronization events */
149	for (i=0; i<np; i++) {
150	    fd = fopen( argv[firstfile+i], "r" );
151	    nsync = extract_timing( i, fd );
152	    if (i > 0 && nsync != nlsync) {
153		fprintf( stderr, "Found differing numbers of syncs\n" );
154		exit(0);
155		}
156	    nlsync = nsync;
157	    fclose( fd );
158	    }
159	/* If we didn't find enough events, we exit */
160	if (nsync < 2) {
161	    fprintf( stderr,
162		     "Not enough synchronization events to adjust logs\n" );
163	    exit(0);
164	    }
165
166	/* Compute a "global clock" time */
167	/* NOTE: if numer is changed, ComputeOffsets must be changed as well */
168	for (i=0; i<np; i++) {
169	    numer[i]        = (synctime[1].time[0] - synctime[0].time[0]);
170	    denom[i]        = (synctime[1].time[i] - synctime[0].time[i]);
171	    /* Using mintime here fails for some log files (since some of the
172	       computed/scaled times can then be negative.  We have to pick
173	       a value that makes the minimum COMPUTED time positive */
174	    globaloffset[i] = synctime[0].time[i]; /*   - mintime; */
175	    }
176	fprintf( stderr, "Summary of clock transformations:\n" );
177	if (noffsetevents == np - 1) {
178	    /* Print out the initial globaloffsets */
179	    fprintf( stderr, "Global offsets from sync events are:\n" );
180	    for (i=0; i<np; i++) {
181		fprintf( stderr, "%4d  %12ld\n", i, globaloffset[i] );
182		}
183	    }
184
185	/* Use adjust events to compute a modified offset (if such events
186	   are not present, the globaloffset values above will be used) */
187	ComputeOffsets( np );
188
189	/* Write a summary */
190	for (i=0; i<np; i++) {
191	    fprintf( stderr, "%4d  (t - %12ld) (%lu/%lu)\n",
192		     i, globaloffset[i], numer[i], denom[i] );
193	    }
194
195	/* Rewrite the log files using the clock adjustment */
196	for (i=0; i<np; i++) {
197	    pid = getpid();
198	    sprintf( headerfile, "%s.new", argv[firstfile+i] );
199/* 	    sprintf(headerfile,"log.header.%d",pid); */
200	    if ( (headerfp=fopen(headerfile,"w")) == NULL ) {
201		fprintf(stderr,"%s: unable to create temp file %s.\n",
202			argv[0], headerfile );
203		exit(0);
204		}
205	    fd = fopen( argv[firstfile+i], "r" );
206	    if (!fd) {
207		fprintf( stderr, "%s: Unable to open log file %s\n",
208			 argv[0], argv[firstfile+i] );
209		exit(0);
210		}
211	    adjust_file( i, fd, headerfp, 0, nsync, argv[firstfile+i] );
212	    fclose( fd );
213	    fclose( headerfp );
214
215	    /* move filename */
216/* 	    unlink( argv[firstfile+i] );
217	    link( headerfile, argv[firstfile+i] );
218	    unlink( headerfile );  */
219	    }
220
221} /* main */
222
223/*
224   Extract timing data for the i'th log file
225 */
226int extract_timing( i, fd )
227int  i;
228FILE *fd;
229{
230struct log_entry entry;
231int    nsync = -1;
232
233while (1) {
234    read_logentry(fd,&entry,DO_NEGATIVE);
235    if ( feof(fd) ) break;
236    if (is_sync_event(entry.event)) {
237	/* We do this so that we save the LAST sync event */
238	if (nsync + 1 < MAX_NSYNC) nsync++;
239	synctime[nsync].time[i] = entry.time;
240	/* fprintf( stdout, "Event type %d at time %d on proc %d\n",
241		 entry.event, entry.time, i ); */
242	}
243	/* For the offset events, the assumption is that each processor
244	   (except for processor 0) is the ORIGINATOR of one offsetevent.
245	   It MAY participate as the respondent (b1 event) for multiple
246	   events, including having processor 0 respond to EVERYONE.
247	   Finally, the (b1) processor has processor number SMALLER than
248	   the (a1,a2) processor.  This makes the equations that need
249	   to be solved for the offsets TRIANGULAR and easy.
250	 */
251    else if (is_a1_event(entry.event)) {
252    	offsetevents[i].a1 = entry.time;
253    	offsetevents[i].p0 = entry.i_data;
254        }
255    else if (is_a2_event(entry.event)) {
256    	offsetevents[i].a2 = entry.time;
257    	offsetevents[i].p0 = entry.i_data;
258    	noffsetevents++;
259        }
260    else if (is_b1_event(entry.event)) {
261    	if (entry.i_data < i) {
262    	    fprintf( stderr,
263	             "Improper offset event (originating processor %d\n", i );
264    	    fprintf( stderr, "higher numbered than partner %d)\n",
265		     entry.i_data );
266    	    exit(0);
267    	    }
268    	offsetevents[entry.i_data].b1 = entry.time;
269    	offsetevents[entry.i_data].p1 = i;
270        }
271    else if (entry.event > 0) {
272	if (mintime > entry.time) mintime = entry.time;
273	}
274    }
275return nsync + 1;
276}
277
278adjust_file( p, fin, fout, leave_events, nsync, fname )
279FILE *fin, *fout;
280int  p, leave_events, nsync;
281char *fname;
282{
283struct log_entry entry;
284unsigned long GlobalTime(), gtime;
285unsigned long lasttime;
286
287/* lasttime is used to make sure that we don't mess up the log files without
288   knowing it */
289lasttime = 0;
290while (1) {
291    read_logentry(fin,&entry,DO_NEGATIVE);
292    if ( feof(fin) ) break;
293    if (!leave_events && (entry.event == ALOG_EVENT_SYNC ||
294			  entry.event == ALOG_EVENT_PAIR_A1 ||
295			  entry.event == ALOG_EVENT_PAIR_A2 ||
296			  entry.event == ALOG_EVENT_PAIR_B1)) continue;
297    /* adjust to the global clock time */
298    gtime = GlobalTime(entry.time,p,nsync);
299    if (entry.event > 0) {
300	if (gtime < lasttime) {
301	    fprintf( stderr, "Error computing global times\n" );
302	    fprintf( stderr, "Times are not properly sorted\n" );
303	    fprintf( stderr, "Last time was %lu, current time is %lu\n",
304		     lasttime, gtime );
305	    fprintf( stderr, "(original new time is %lu)\n", entry.time );
306	    fprintf( stderr, "processing file %s\n", fname );
307	    exit(0);
308	    }
309	else
310	    lasttime = gtime;
311	}
312    /* negative events are unchanged. */
313    fprintf(fout,"%d %d %d %d %d %lu %s\n",entry.event,
314	    entry.proc_id,entry.task_id,entry.i_data,
315	    entry.time_slot, (entry.event >= 0) ? gtime : entry.time,
316	    entry.c_data);
317    }
318}
319
320usage( a )
321char *a;
322{
323	fprintf(stderr,"%s: %s infile1 infile2 ...\n",a,a);
324	fprintf(stderr,"  updates files with synchronized clocks\n");
325
326	exit(0);
327}
328
329read_logentry(fp,table,do_negs)
330FILE *fp;
331struct log_entry *table;
332int do_negs;
333{
334	char buf[81];
335	char *cp;
336
337	int i;
338
339	do
340	{
341		fscanf(fp,"%d %d %d %d %d %lu",
342			&(table->event),&(table->proc_id),&(table->task_id),
343			&(table->i_data),&(table->time_slot),&(table->time));
344
345		cp = table->c_data;
346
347		i = 0;
348
349		do
350		{
351			fscanf(fp,"%c",cp);
352		}
353		while ( *cp == ' ' || *cp == '\t' );
354
355		i++;
356
357		while ( *cp != '\n' && i < C_DATA_LEN )
358		{
359			fscanf(fp,"%c",++cp);
360
361			i++;
362		}
363
364		*cp = '\0';
365
366		/*
367		if ( !feof(fp) && table->event == 0 )
368			fprintf(stderr,"0 reading in.\n");
369		*/
370	}
371	while( table->event < 0 && do_negs == IGNORE_NEGATIVE && !feof(fp) );
372}
373
374/* This routine allows use to define MANY sync events */
375int is_sync_event( type )
376int type;
377{
378int i;
379
380if (type == ALOG_EVENT_SYNC) return 1;
381for (i=0; i<syncep; i++)
382    if (type == syncevent[i]) return 1;
383return 0;
384}
385
386int is_a1_event( type )
387int type;
388{
389int i;
390
391if (type == ALOG_EVENT_PAIR_A1) return 1;
392for (i=0; i<a1p; i++)
393    if (type == a1event[i]) return 1;
394return 0;
395}
396
397int is_a2_event( type )
398int type;
399{
400int i;
401
402if (type == ALOG_EVENT_PAIR_A2) return 1;
403for (i=0; i<a2p; i++)
404    if (type == a2event[i]) return 1;
405return 0;
406}
407
408int is_b1_event( type )
409int type;
410{
411int i;
412
413if (type == ALOG_EVENT_PAIR_B1) return 1;
414for (i=0; i<b1p; i++)
415    if (type == b1event[i]) return 1;
416return 0;
417}
418
419unsigned long GlobalTime( time, p, nsync )
420unsigned long time;
421int           p, nsync;
422{
423unsigned long gtime, stime1, stime2;
424unsigned long frac;
425unsigned long tdiff;
426unsigned long ScaleLong();
427
428/* Problem: since times are UNSIGNED, we have to be careful about how they
429   are adjusted.  time - synctime may not be positive.  We make sure that
430   all of the subexpressions are unsigned longs */
431if (time >= globaloffset[p]) {
432    tdiff = time - globaloffset[p];
433    frac  = ScaleLong( numer[p], denom[p], tdiff );
434    gtime = frac + globaloffset[0];
435    }
436else {
437    tdiff = globaloffset[p] - time;
438    frac  = ScaleLong( numer[p], denom[p], tdiff );
439    if (frac > globaloffset[0]) printf( "Oops!\n" );
440    gtime = globaloffset[0] - frac;
441    }
442return gtime;
443}
444
445/*
446    This routine takes offset events and solves for the offsets.  The
447    approach is:
448
449    Let the global time be given by (local_time - offset)*scale ,
450    with a different offset and scale on each processor.  Each processor
451    originates exactly one communication event (except processor 0),
452    generating an a1 and a2 event.  A corresponding number of b2 events
453    are generated, but note that one processor may have more than 1 b2
454    event (if using Dunnigan's synchronization, there will be np-1 b2 events
455    on processor 0, and none anywhere else).
456
457    These events are:
458
459   pi   a1 (send to nbr)                        (recv) a2
460   pj                     (recv) b1 (send back)
461
462    We base the analysis on the assumption that in the GLOBAL time
463    repreresentation, a2-a1 is twice the time to do a (send) and
464    a (recv).  This is equivalent to assuming that global((a1+a2)/2) ==
465    global(b1).  Then, with the unknowns the offsets (the scales
466    are assumed known from the syncevent calculation), the matrix is
467
468    1
469    -s0 s1
470       ....
471       -sj ... si
472
473    where si is the scale for the i'th processor (note s0 = 1).
474    The right hand sides are (1/2)(a1(i)+a2(i)) *s(i) - b1(j)*s(j).
475    Because of the triangular nature of the matrix, this reduces to
476
477       o(i) = (a1(i)+a2(i))/2 - (s(j)/s(i)) * (b1(j)-o(j))
478
479    Note that if s(i)==s(j) and b1 == (a1+a2)/2, this gives o(i)==o(j).
480
481 */
482void ComputeOffsets( np )
483int np;
484{
485int i, j;
486unsigned long d1, delta;
487unsigned long ScaleLong();
488
489/* If there aren't enough events, return */
490if (noffsetevents != np - 1) {
491    if (noffsetevents != 0)
492	fprintf( stderr,
493	   "Incorrect number of offset events to compute clock offsets\n" );
494    else
495	fprintf( stderr, "No clock offset events\n" );
496    return;
497    }
498
499/* Take globaloffset[0] from sync */
500for (i=1; i<np; i++) {
501    /* o(i) = (a1(i)+a2(i))/2 - (s(j)/s(i)) * (b1(j)-o(j)) */
502    j  = offsetevents[i].p1;
503
504    /* Compute a1(i)+a2(i)/2.  Do this by adding half the difference;
505       this insures that we avoid overflow */
506    d1 = (offsetevents[i].a2 - offsetevents[i].a1)/2;
507    d1 = offsetevents[i].a1 + d1;
508
509    /* We form (b1-o(j))(s(j)/s(i)) by noting that
510       s(j)/s(i) == denom(i)/denom(j) (since numer(i)==numer(j)) */
511    delta = ScaleLong( denom[i], denom[j],
512                       offsetevents[i].b1 - globaloffset[j] );
513
514    globaloffset[i] = d1 - delta;
515    }
516}
517
518#include <mp.h>
519static MINT *prod, *qq, *rr;
520static int  mpallocated = 0;
521
522unsigned long ScaleLong( n, d, v )
523unsigned long n, d, v;
524{
525char buf[40];
526char *s;
527MINT *nn, *dd, *vv;
528unsigned long q, r;
529
530if (!mpallocated) {
531    prod = itom(0);
532    if (!prod) {
533	fprintf( stderr, "Could not allocate mp int\n" );
534	exit(0);
535	}
536    qq   = itom(0);
537    if (!qq) {
538	fprintf( stderr, "Could not allocate mp int\n" );
539	exit(0);
540	}
541    rr   = itom(0);
542    if (!rr) {
543	fprintf( stderr, "Could not allocate mp int\n" );
544	exit(0);
545	}
546    mpallocated = 1;
547    }
548
549sprintf( buf, "%x", n );
550nn = xtom(buf);
551if (!nn) {
552    fprintf( stderr, "Could not allocate mp int\n" );
553    exit(0);
554    }
555sprintf( buf, "%x", v );
556vv = xtom(buf);
557if (!vv) {
558    fprintf( stderr, "Could not allocate mp int\n" );
559    exit(0);
560    }
561sprintf( buf, "%x", d );
562dd = xtom(buf);
563if (!dd) {
564    fprintf( stderr, "Could not allocate mp int\n" );
565    exit(0);
566    }
567
568mult(nn,vv,prod);
569mdiv(prod,dd,qq,rr);
570s = mtox(qq);
571sscanf( s, "%x", &q );
572free( s );
573s = mtox(rr);
574sscanf( s, "%x", &r );
575free( s );
576
577/* Free the locals */
578mfree( nn );
579mfree( dd );
580mfree( vv );
581
582return q;
583}
584
585
586/* Here is not-quite working code for multiple precision arithmetic */
587#ifdef DO_MULTIPLE_ARITH
588
589/*
590   This routine takes a value v and scales it by (n/d).  This
591   routine handles integer overflow by using the following formulas:
592   Let h(u) = high 16 bits of u, and l(u) = low 16 bits of u.
593   Then v *(n/d) =
594
595   (l(v)+h(v))*(l(u)+h(u))/d
596   = l(v)l(n)/d + (l(n)h(v)+l(v)h(n))/d + h(v)h(n)/d
597
598   == a1/d + a2/d + a3/d
599
600   In order to keep the values in-range, we define low(u)=l(u) and
601   high(u) = h(u) >> 16.  Then this formula becomes (with high substituted
602   for h):
603
604   a1/d + (a2<<16)/d + (a3<<32)/d
605
606   Now, when doing the integer division, we need to propagate the remainders.
607   Let the result be r.  Then
608
609   rd = a1 + (a2<<16) + (a3<<32)
610
611   if a1 = k1 d + b1, (a2 << 16) = (k2 d + b2), and (a3 << 32) = (k3 d + b3),
612   then
613
614   r d = (k1 d + b1) + (k2 d + b2) + (k3 d + b3);
615       = (k1 + k2 + k3) d + (b1 + b2 + b3)
616
617   and so
618
619   r   = (k1 + k2 + k3) + (b1 + b2 + b3) / d
620
621   To compute (k2,b2) and (k3,b3), we do:
622
623   (a2<<16)/d:
624
625   a2 = p2 d + c2
626   a2 << 16 = p2 d << 16 + c2 << 16
627            = (p2 << 16) d + c2 << 16
628   Let c2 << 16 = r2 d + s2, then (finally!)
629   a2 << 16 = (p2 << 16 + r2) d + s2
630
631   (a3 << 32)/d:
632
633   a3 = p3 d + c3
634   a3 << 32 = p3 d << 32 + c3 << 32
635            = (p3 << 32) d + c3 << 32
636   Computing c3 << 32 = r3 d + s3 is a challange, particularly
637   since we need only the low 32 bits (the high 32 bits will be 0)
638   We do this in stages as well:
639
640   c3 << 32 = (c3 << 16) << 16;
641   (c3 << 16) = t3 d + u3
642   (c3 << 32) = (t3 << 16)d + u3 << 16
643              = (t3 << 16 + y3)d + z3,
644	      == r3 d + s3
645   where u3 << 16 = y3 d + z3
646
647   Then
648   a3 << 32 = (p3 << 32 + r3) d + s3
649 */
650void DivLong();
651
652/*
653   ScaleDecomp - convert (a << p) = alpha d + beta, with beta < d
654
655   This works by recursively:
656   a = b d + r,
657   a<<p = (b << p)d + (r<<p)
658   then process r<<p to bd + r' etc until b == 0
659 */
660void ScaleDecomp( a, p, d, alpha, beta )
661int           p;
662unsigned long a, d, *alpha, *beta;
663{
664unsigned long b, r;
665unsigned long Alpha, Beta;
666int      p1;
667
668Alpha = 0; Beta = 0;
669
670b     = a / d;
671r     = a % d;
672Alpha = b << p;
673/* We need to gingerly deal with r, since shifting it by much
674   may make it too large, particularly if d is nearly 32 bits.
675   What we need is r << p = gamma d + delta, with r < d.  This
676   is really the hard part. We can not assume that d is much
677   smaller than 32 bits, so this is tricky. */
678
679DivLong( r, d, (unsigned long)(1 << p), &b, &r );
680Alpha += b;
681*beta = r;
682#ifdef FOO
683while (p > 1 && r > 0) {
684    p1    = p/2;
685    r     = (r << p1);
686    b     = r / d;
687    r     = r % d;
688    Alpha += b << (p-p1);
689    p      = (p - p1);
690    }
691*alpha = Alpha;
692*beta  = r << p;
693#endif
694}
695#define LOWBITS(a) (unsigned long)((a)&lowmask)
696#define HIGHBITS(a) (unsigned long)( ((a) >> 16 ) & lowmask )
697
698#include <mp.h>
699unsigned long ScaleLong( n, d, v )
700unsigned long n, d, v;
701{
702#ifdef FOO
703#define LOWBITS(a) (unsigned long)((a)&lowmask)
704#define HIGHBITS(a) (unsigned long)( ((a) >> 16 ) & lowmask )
705unsigned long a1, a21, a22, a3, k1, k21, k22, k3, b1, b21, b22, b3;
706
707DivLong( n, d, v, &k1, &b1 );
708return k1 + b1/d;
709
710a1  = LOWBITS(v)*LOWBITS(n);
711a21 = LOWBITS(v)*HIGHBITS(n);
712a22 = LOWBITS(n)*HIGHBITS(v);
713a3  = HIGHBITS(v)*HIGHBITS(n);
714
715k1 = a1 / d;
716b1 = a1 % d;
717
718ScaleDecomp( a21, 16, d, &k21, &b21 );
719ScaleDecomp( a22, 16, d, &k22, &b22 );
720ScaleDecomp( a3,  32, d, &k3,  &b3 );
721
722return (k1 + k21 + k22 + k3) + (b1 + b21 + b22 + b3) / d;
723#else
724char buf[40];
725MINT *nn, *dd, *vv, *prod, *qq, *rr;
726unsigned long q, r;
727
728sprintf( buf, "%x", n );
729nn = xtom(buf);
730sprintf( buf, "%x", v );
731vv = xtom(buf);
732sprintf( buf, "%x", d );
733dd = xtom(buf);
734prod = itom(0);
735qq   = itom(0);
736rr   = itom(0);
737mult(nn,vv,prod);
738mdiv(prod,dd,qq,rr);
739sscanf( mtox(qq), "%x", &q );
740sscanf( mtox(rr), "%x", &r );
741return q;
742#endif
743}
744
745/*
746  Represent nv = alpha d + beta
747 */
748void DivLong( n, d, v, alpha, beta )
749unsigned long n, d, v;
750unsigned long *alpha, *beta;
751{
752unsigned long a1, a21, a22, a3, k1, k21, k22, k3, b1, b21, b22, b3;
753
754a1  = LOWBITS(v)*LOWBITS(n);
755a21 = LOWBITS(v)*HIGHBITS(n);
756a22 = LOWBITS(n)*HIGHBITS(v);
757a3  = HIGHBITS(v)*HIGHBITS(n);
758
759k1 = a1 / d;
760b1 = a1 % d;
761
762ScaleDecomp( a21, 16, d, &k21, &b21 );
763ScaleDecomp( a22, 16, d, &k22, &b22 );
764ScaleDecomp( a3,  32, d, &k3,  &b3 );
765
766*alpha = k1 + k21 + k22 + k3;
767*beta  = b1 + b21 + b22 + b3;
768}
769
770#endif
771