14887Schin /***********************************************************************
24887Schin * *
34887Schin * This software is part of the ast package *
4*12068SRoger.Faulkner@Oracle.COM * Copyright (c) 1985-2010 AT&T Intellectual Property *
54887Schin * and is licensed under the *
64887Schin * Common Public License, Version 1.0 *
78462SApril.Chin@Sun.COM * by AT&T Intellectual Property *
84887Schin * *
94887Schin * A copy of the License is available at *
104887Schin * http://www.opensource.org/licenses/cpl1.0.txt *
114887Schin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
124887Schin * *
134887Schin * Information and Software Systems Research *
144887Schin * AT&T Research *
154887Schin * Florham Park NJ *
164887Schin * *
174887Schin * Glenn Fowler <gsf@research.att.com> *
184887Schin * David Korn <dgk@research.att.com> *
194887Schin * Phong Vo <kpv@research.att.com> *
204887Schin * *
214887Schin ***********************************************************************/
224887Schin #pragma prototyped
234887Schin
/*
 * determine the record format by sampling the data in <buf,size>
 * total is the total file size, <=0 if not available
 * return r:
 *	-1				could not determine
 *	RECTYPE(r)==REC_fixed		fixed length, record size is REC_F_SIZE(r)
 *	RECTYPE(r)==REC_delimited	variable length, delimiter is REC_D_DELIMITER(r)
 *	RECTYPE(r)==REC_variable	variable length
 */
334887Schin
344887Schin #include <recfmt.h>
354887Schin
/*
 * scratch tables used while sampling for fixed length records
 */

typedef struct Sample_s
{
	/* rep[d]: how many times a byte value recurred at distance d */
	unsigned int	rep[4 * 1024];

	/* hit[c]: sample offset at which byte value c was last recorded */
	unsigned int	hit[UCHAR_MAX + 1];
} Sample_t;
414887Schin
424887Schin Recfmt_t
recfmt(const void * buf,size_t size,off_t total)434887Schin recfmt(const void* buf, size_t size, off_t total)
444887Schin {
454887Schin register unsigned char* s;
464887Schin register unsigned char* t;
474887Schin register Sample_t* q;
484887Schin register unsigned int* h;
494887Schin register unsigned int i;
504887Schin unsigned int j;
514887Schin unsigned int k;
524887Schin unsigned int n;
534887Schin unsigned int m;
544887Schin unsigned int x;
554887Schin unsigned long f;
564887Schin unsigned long g;
574887Schin
584887Schin static unsigned char terminators[] = { '\n', 0x15, 0x25 };
594887Schin
604887Schin /*
614887Schin * check for V format
624887Schin */
634887Schin
644887Schin s = (unsigned char*)buf;
654887Schin t = s + size;
664887Schin while ((k = (t - s)) >= 4 && !s[2] && !s[3])
674887Schin {
684887Schin if ((i = (s[0]<<8)|s[1]) > k)
694887Schin break;
704887Schin s += i;
714887Schin }
724887Schin if (!k || size > 2 * k)
734887Schin return REC_V_TYPE(4, 0, 2, 0, 1);
744887Schin s = (unsigned char*)buf;
754887Schin
764887Schin /*
774887Schin * check for terminated records
784887Schin */
794887Schin
804887Schin for (i = 0; i < elementsof(terminators); i++)
814887Schin if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n)))
824887Schin {
834887Schin for (j = n - 1; j < size; j += n)
844887Schin if (s[j] != k)
854887Schin {
864887Schin n = 0;
874887Schin break;
884887Schin }
894887Schin if (n)
904887Schin return REC_D_TYPE(terminators[i]);
914887Schin }
924887Schin
934887Schin /*
944887Schin * check fixed length record frequencies
954887Schin */
964887Schin
974887Schin if (!(q = newof(0, Sample_t, 1, 0)))
984887Schin return REC_N_TYPE();
994887Schin x = 0;
1004887Schin for (i = 0; i < size; i++)
1014887Schin {
1024887Schin h = q->hit + s[i];
1034887Schin m = i - *h;
1044887Schin *h = i;
1054887Schin if (m < elementsof(q->rep))
1064887Schin {
1074887Schin if (m > x)
1084887Schin x = m;
1094887Schin q->rep[m]++;
1104887Schin }
1114887Schin }
1124887Schin n = 0;
1134887Schin m = 0;
1144887Schin f = ~0;
1154887Schin for (i = x; i > 1; i--)
1164887Schin {
1174887Schin if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n])
1184887Schin {
1194887Schin m++;
1204887Schin g = 0;
1214887Schin for (j = i; j < size - i; j += i)
1224887Schin for (k = 0; k < i; k++)
1234887Schin if (s[j + k] != s[j + k - i])
1244887Schin g++;
1254887Schin g = (((g * 100) / i) * 100) / q->rep[i];
1264887Schin if (g <= f)
1274887Schin {
1284887Schin f = g;
1294887Schin n = i;
1304887Schin }
1314887Schin }
1324887Schin }
1334887Schin if (m <= 1 && n <= 2 && total > 1 && total < 256)
1344887Schin {
1354887Schin n = 0;
1364887Schin for (i = 0; i < size; i++)
1374887Schin for (j = 0; j < elementsof(terminators); j++)
1384887Schin if (s[i] == terminators[j])
1394887Schin n++;
1404887Schin n = n ? 0 : total;
1414887Schin }
1424887Schin free(q);
1434887Schin return n ? REC_F_TYPE(n) : REC_N_TYPE();
1444887Schin }
1454887Schin
1464887Schin #if MAIN
1474887Schin
main()1484887Schin main()
1494887Schin {
1504887Schin void* s;
1514887Schin size_t size;
1524887Schin off_t total;
1534887Schin
1544887Schin if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0)))
1554887Schin {
1564887Schin sfprintf(sfstderr, "read error\n");
1574887Schin return 1;
1584887Schin }
1594887Schin size = sfvalue(sfstdin);
1604887Schin total = sfsize(sfstdin);
1614887Schin sfprintf(sfstdout, "%d\n", recfmt(s, size, total));
1624887Schin return 0;
1634887Schin }
1644887Schin
1654887Schin #endif
166