1*4887Schin /*********************************************************************** 2*4887Schin * * 3*4887Schin * This software is part of the ast package * 4*4887Schin * Copyright (c) 1985-2007 AT&T Knowledge Ventures * 5*4887Schin * and is licensed under the * 6*4887Schin * Common Public License, Version 1.0 * 7*4887Schin * by AT&T Knowledge Ventures * 8*4887Schin * * 9*4887Schin * A copy of the License is available at * 10*4887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 11*4887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12*4887Schin * * 13*4887Schin * Information and Software Systems Research * 14*4887Schin * AT&T Research * 15*4887Schin * Florham Park NJ * 16*4887Schin * * 17*4887Schin * Glenn Fowler <gsf@research.att.com> * 18*4887Schin * David Korn <dgk@research.att.com> * 19*4887Schin * Phong Vo <kpv@research.att.com> * 20*4887Schin * * 21*4887Schin ***********************************************************************/ 22*4887Schin #pragma prototyped 23*4887Schin 24*4887Schin /* 25*4887Schin * determine record format by sampling data in <buf,size> 26*4887Schin * total is the total file size, <=0 if not available 27*4887Schin * return r: 28*4887Schin * -1 could not determine 29*4887Schin * RECTYPE(r)==REC_fixed fixed length REC_F_SIZE(r) 30*4887Schin * RECTYPE(r)==REC_delimited variable length delimiter=REC_D_DELIMITER(r) 31*4887Schin * RECTYPE(r)==REC_variable variable length 32*4887Schin */ 33*4887Schin 34*4887Schin #include <recfmt.h> 35*4887Schin 36*4887Schin typedef struct 37*4887Schin { 38*4887Schin unsigned int rep[4 * 1024]; 39*4887Schin unsigned int hit[UCHAR_MAX + 1]; 40*4887Schin } Sample_t; 41*4887Schin 42*4887Schin Recfmt_t 43*4887Schin recfmt(const void* buf, size_t size, off_t total) 44*4887Schin { 45*4887Schin register unsigned char* s; 46*4887Schin register unsigned char* t; 47*4887Schin register Sample_t* q; 48*4887Schin register unsigned int* h; 49*4887Schin register unsigned int i; 50*4887Schin unsigned int j; 51*4887Schin unsigned int k; 52*4887Schin unsigned int n; 53*4887Schin unsigned int m; 54*4887Schin unsigned int x; 55*4887Schin unsigned long f; 56*4887Schin unsigned long g; 57*4887Schin 58*4887Schin static unsigned char terminators[] = { '\n', 0x15, 0x25 }; 59*4887Schin 60*4887Schin /* 61*4887Schin * check for V format 62*4887Schin */ 63*4887Schin 64*4887Schin s = (unsigned char*)buf; 65*4887Schin t = s + size; 66*4887Schin while ((k = (t - s)) >= 4 && !s[2] && !s[3]) 67*4887Schin { 68*4887Schin if ((i = (s[0]<<8)|s[1]) > k) 69*4887Schin break; 70*4887Schin s += i; 71*4887Schin } 72*4887Schin if (!k || size > 2 * k) 73*4887Schin return REC_V_TYPE(4, 0, 2, 0, 1); 74*4887Schin s = (unsigned char*)buf; 75*4887Schin 76*4887Schin /* 77*4887Schin * check for terminated records 78*4887Schin */ 79*4887Schin 80*4887Schin for (i = 0; i < elementsof(terminators); i++) 81*4887Schin if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n))) 82*4887Schin { 83*4887Schin for (j = n - 1; j < size; j += n) 84*4887Schin if (s[j] != k) 85*4887Schin { 86*4887Schin n = 0; 87*4887Schin break; 88*4887Schin } 89*4887Schin if (n) 90*4887Schin return REC_D_TYPE(terminators[i]); 91*4887Schin } 92*4887Schin 93*4887Schin /* 94*4887Schin * check fixed length record frequencies 95*4887Schin */ 96*4887Schin 97*4887Schin if (!(q = newof(0, Sample_t, 1, 0))) 98*4887Schin return REC_N_TYPE(); 99*4887Schin x = 0; 100*4887Schin for (i = 0; i < size; i++) 101*4887Schin { 102*4887Schin h = q->hit + s[i]; 103*4887Schin m = i - *h; 104*4887Schin *h = i; 105*4887Schin if (m < elementsof(q->rep)) 106*4887Schin { 107*4887Schin if (m > x) 108*4887Schin x = m; 109*4887Schin q->rep[m]++; 110*4887Schin } 111*4887Schin } 112*4887Schin n = 0; 113*4887Schin m = 0; 114*4887Schin f = ~0; 115*4887Schin for (i = x; i > 1; i--) 116*4887Schin { 117*4887Schin if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n]) 118*4887Schin { 119*4887Schin m++; 120*4887Schin g = 0; 121*4887Schin for (j = i; j < size - i; j += i) 122*4887Schin for (k = 0; k < i; k++) 123*4887Schin if (s[j + k] != s[j + k - i]) 124*4887Schin g++; 125*4887Schin g = (((g * 100) / i) * 100) / q->rep[i]; 126*4887Schin if (g <= f) 127*4887Schin { 128*4887Schin f = g; 129*4887Schin n = i; 130*4887Schin } 131*4887Schin } 132*4887Schin } 133*4887Schin if (m <= 1 && n <= 2 && total > 1 && total < 256) 134*4887Schin { 135*4887Schin n = 0; 136*4887Schin for (i = 0; i < size; i++) 137*4887Schin for (j = 0; j < elementsof(terminators); j++) 138*4887Schin if (s[i] == terminators[j]) 139*4887Schin n++; 140*4887Schin n = n ? 0 : total; 141*4887Schin } 142*4887Schin free(q); 143*4887Schin return n ? REC_F_TYPE(n) : REC_N_TYPE(); 144*4887Schin } 145*4887Schin 146*4887Schin #if MAIN 147*4887Schin 148*4887Schin main() 149*4887Schin { 150*4887Schin void* s; 151*4887Schin size_t size; 152*4887Schin off_t total; 153*4887Schin 154*4887Schin if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0))) 155*4887Schin { 156*4887Schin sfprintf(sfstderr, "read error\n"); 157*4887Schin return 1; 158*4887Schin } 159*4887Schin size = sfvalue(sfstdin); 160*4887Schin total = sfsize(sfstdin); 161*4887Schin sfprintf(sfstdout, "%d\n", recfmt(s, size, total)); 162*4887Schin return 0; 163*4887Schin } 164*4887Schin 165*4887Schin #endif 166