14887Schin /*********************************************************************** 24887Schin * * 34887Schin * This software is part of the ast package * 4*8462SApril.Chin@Sun.COM * Copyright (c) 1985-2008 AT&T Intellectual Property * 54887Schin * and is licensed under the * 64887Schin * Common Public License, Version 1.0 * 7*8462SApril.Chin@Sun.COM * by AT&T Intellectual Property * 84887Schin * * 94887Schin * A copy of the License is available at * 104887Schin * http://www.opensource.org/licenses/cpl1.0.txt * 114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 124887Schin * * 134887Schin * Information and Software Systems Research * 144887Schin * AT&T Research * 154887Schin * Florham Park NJ * 164887Schin * * 174887Schin * Glenn Fowler <gsf@research.att.com> * 184887Schin * David Korn <dgk@research.att.com> * 194887Schin * Phong Vo <kpv@research.att.com> * 204887Schin * * 214887Schin ***********************************************************************/ 224887Schin #pragma prototyped 234887Schin 244887Schin /* 254887Schin * determine record format by sampling data in <buf,size> 264887Schin * total is the total file size, <=0 if not available 274887Schin * return r: 284887Schin * -1 could not determine 294887Schin * RECTYPE(r)==REC_fixed fixed length REC_F_SIZE(r) 304887Schin * RECTYPE(r)==REC_delimited variable length delimiter=REC_D_DELIMITER(r) 314887Schin * RECTYPE(r)==REC_variable variable length 324887Schin */ 334887Schin 344887Schin #include <recfmt.h> 354887Schin 364887Schin typedef struct 374887Schin { 384887Schin unsigned int rep[4 * 1024]; 394887Schin unsigned int hit[UCHAR_MAX + 1]; 404887Schin } Sample_t; 414887Schin 424887Schin Recfmt_t 434887Schin recfmt(const void* buf, size_t size, off_t total) 444887Schin { 454887Schin register unsigned char* s; 464887Schin register unsigned char* t; 474887Schin register Sample_t* q; 484887Schin register unsigned int* h; 494887Schin register unsigned int i; 504887Schin unsigned int j; 514887Schin unsigned int k; 524887Schin unsigned int n; 534887Schin unsigned int m; 544887Schin unsigned int x; 554887Schin unsigned long f; 564887Schin unsigned long g; 574887Schin 584887Schin static unsigned char terminators[] = { '\n', 0x15, 0x25 }; 594887Schin 604887Schin /* 614887Schin * check for V format 624887Schin */ 634887Schin 644887Schin s = (unsigned char*)buf; 654887Schin t = s + size; 664887Schin while ((k = (t - s)) >= 4 && !s[2] && !s[3]) 674887Schin { 684887Schin if ((i = (s[0]<<8)|s[1]) > k) 694887Schin break; 704887Schin s += i; 714887Schin } 724887Schin if (!k || size > 2 * k) 734887Schin return REC_V_TYPE(4, 0, 2, 0, 1); 744887Schin s = (unsigned char*)buf; 754887Schin 764887Schin /* 774887Schin * check for terminated records 784887Schin */ 794887Schin 804887Schin for (i = 0; i < elementsof(terminators); i++) 814887Schin if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n))) 824887Schin { 834887Schin for (j = n - 1; j < size; j += n) 844887Schin if (s[j] != k) 854887Schin { 864887Schin n = 0; 874887Schin break; 884887Schin } 894887Schin if (n) 904887Schin return REC_D_TYPE(terminators[i]); 914887Schin } 924887Schin 934887Schin /* 944887Schin * check fixed length record frequencies 954887Schin */ 964887Schin 974887Schin if (!(q = newof(0, Sample_t, 1, 0))) 984887Schin return REC_N_TYPE(); 994887Schin x = 0; 1004887Schin for (i = 0; i < size; i++) 1014887Schin { 1024887Schin h = q->hit + s[i]; 1034887Schin m = i - *h; 1044887Schin *h = i; 1054887Schin if (m < elementsof(q->rep)) 1064887Schin { 1074887Schin if (m > x) 1084887Schin x = m; 1094887Schin q->rep[m]++; 1104887Schin } 1114887Schin } 1124887Schin n = 0; 1134887Schin m = 0; 1144887Schin f = ~0; 1154887Schin for (i = x; i > 1; i--) 1164887Schin { 1174887Schin if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n]) 1184887Schin { 1194887Schin m++; 1204887Schin g = 0; 1214887Schin for (j = i; j < size - i; j += i) 1224887Schin for (k = 0; k < i; k++) 1234887Schin if (s[j + k] != s[j + k - i]) 1244887Schin g++; 1254887Schin g = (((g * 100) / i) * 100) / q->rep[i]; 1264887Schin if (g <= f) 1274887Schin { 1284887Schin f = g; 1294887Schin n = i; 1304887Schin } 1314887Schin } 1324887Schin } 1334887Schin if (m <= 1 && n <= 2 && total > 1 && total < 256) 1344887Schin { 1354887Schin n = 0; 1364887Schin for (i = 0; i < size; i++) 1374887Schin for (j = 0; j < elementsof(terminators); j++) 1384887Schin if (s[i] == terminators[j]) 1394887Schin n++; 1404887Schin n = n ? 0 : total; 1414887Schin } 1424887Schin free(q); 1434887Schin return n ? REC_F_TYPE(n) : REC_N_TYPE(); 1444887Schin } 1454887Schin 1464887Schin #if MAIN 1474887Schin 1484887Schin main() 1494887Schin { 1504887Schin void* s; 1514887Schin size_t size; 1524887Schin off_t total; 1534887Schin 1544887Schin if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0))) 1554887Schin { 1564887Schin sfprintf(sfstderr, "read error\n"); 1574887Schin return 1; 1584887Schin } 1594887Schin size = sfvalue(sfstdin); 1604887Schin total = sfsize(sfstdin); 1614887Schin sfprintf(sfstdout, "%d\n", recfmt(s, size, total)); 1624887Schin return 0; 1634887Schin } 1644887Schin 1654887Schin #endif 166