/* $NetBSD: uniq.c,v 1.7 2021/03/22 03:28:55 christos Exp $ */

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Christos Zoulas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
314f52df9aSchristos #include <sys/cdefs.h>
32*5eb26e73Schristos __RCSID("$NetBSD: uniq.c,v 1.7 2021/03/22 03:28:55 christos Exp $");
334f52df9aSchristos
344f52df9aSchristos #include <stdio.h>
354f52df9aSchristos #include <string.h>
364f52df9aSchristos #include <stdlib.h>
374f52df9aSchristos #include <db.h>
384f52df9aSchristos #include <err.h>
394f52df9aSchristos #include <util.h>
404f52df9aSchristos #include <ctype.h>
414f52df9aSchristos #include <fcntl.h>
424f52df9aSchristos
43b9cf7d31Sjoerg #include "extern.h"
444f52df9aSchristos
45b9cf7d31Sjoerg static const HASHINFO hinfo = {
46b9cf7d31Sjoerg .bsize = 256,
47b9cf7d31Sjoerg .ffactor = 4,
48b9cf7d31Sjoerg .nelem = 32768,
49b9cf7d31Sjoerg .cachesize = 1024,
50b9cf7d31Sjoerg .hash = NULL,
51b9cf7d31Sjoerg .lorder = 0
52b9cf7d31Sjoerg };
53b9cf7d31Sjoerg
544f52df9aSchristos static int comp(const char *, char **, size_t *);
554f52df9aSchristos
564f52df9aSchristos /*
5765c07d0bSchristos * Preserve only unique content lines in a file. Input lines that have
584f52df9aSchristos * content [alphanumeric characters before a comment] are white-space
594f52df9aSchristos * normalized and have their comments removed. Then they are placed
604f52df9aSchristos * in a hash table, and only the first instance of them is printed.
614f52df9aSchristos * Comment lines without any alphanumeric content are always printed
624f52df9aSchristos * since they are there to make the file "pretty". Comment lines with
634f52df9aSchristos * alphanumeric content are also placed into the hash table and only
644f52df9aSchristos * printed once.
654f52df9aSchristos */
664f52df9aSchristos void
uniq(const char * fname)674f52df9aSchristos uniq(const char *fname)
684f52df9aSchristos {
694f52df9aSchristos DB *db;
704f52df9aSchristos DBT key;
714f52df9aSchristos static const DBT data = { NULL, 0 };
724f52df9aSchristos FILE *fp;
734f52df9aSchristos char *line;
744f52df9aSchristos size_t len;
754f52df9aSchristos
764f52df9aSchristos if ((db = dbopen(NULL, O_RDWR, 0, DB_HASH, &hinfo)) == NULL)
774f52df9aSchristos err(1, "Cannot create in memory database");
784f52df9aSchristos
7965c07d0bSchristos fp = efopen(fname, "r");
804f52df9aSchristos while ((line = fgetln(fp, &len)) != NULL) {
814f52df9aSchristos size_t complen = len;
824f52df9aSchristos char *compline;
834f52df9aSchristos if (!comp(line, &compline, &complen)) {
844f52df9aSchristos (void)fprintf(stdout, "%*.*s", (int)len, (int)len,
854f52df9aSchristos line);
864f52df9aSchristos continue;
874f52df9aSchristos }
884f52df9aSchristos key.data = compline;
894f52df9aSchristos key.size = complen;
904f52df9aSchristos switch ((db->put)(db, &key, &data, R_NOOVERWRITE)) {
914f52df9aSchristos case 0:
924f52df9aSchristos (void)fprintf(stdout, "%*.*s", (int)len, (int)len,
934f52df9aSchristos line);
944f52df9aSchristos break;
954f52df9aSchristos case 1:
964f52df9aSchristos break;
974f52df9aSchristos case -1:
984f52df9aSchristos err(1, "put");
990ff5eeabSchristos /*NOTREACHED*/
1004f52df9aSchristos default:
1014f52df9aSchristos abort();
1024f52df9aSchristos break;
1034f52df9aSchristos }
1044f52df9aSchristos }
1054f52df9aSchristos (void)fflush(stdout);
1064f52df9aSchristos exit(0);
1074f52df9aSchristos }
1084f52df9aSchristos
/*
 * Normalize whitespace in the original line and place a new string
 * with each run of whitespace converted to a single space in compline;
 * leading and trailing whitespace is dropped.  If the line contains
 * just a comment, we preserve it.  If it contains data followed by a
 * comment, we kill the comment (scanning stops at the first '#' seen
 * after alphanumeric data).
 *
 * origline	input bytes; need not be NUL-terminated.  At most *len
 *		bytes are examined, and scanning also stops at a NUL.
 * compline	on a return of 1, receives a NUL-terminated buffer from
 *		emalloc() that the caller must free(); otherwise NULL.
 * len		in: number of valid bytes in origline;
 *		out: length of *compline (without the NUL), or 0.
 *
 * Returns 1 if the line had actual (alphanumeric) contents, or 0 if it
 * was empty or just a comment without alphanumeric characters.
 */
static int
comp(const char *origline, char **compline, size_t *len)
{
	const unsigned char *p;
	unsigned char *q;
	char *cline;
	size_t l = *len, complen;
	int hasalnum, iscomment;

	/* Give the outputs defined values on every return path. */
	*compline = NULL;
	*len = 0;

	/* Eat leading space */
	for (p = (const unsigned char *)origline; l && *p && isspace(*p);
	    p++, l--)
		continue;
	/*
	 * Check the remaining length before looking at *p: once l hits
	 * zero, p points one past the caller's buffer and must not be
	 * dereferenced.
	 */
	if (l == 0 || *p == '\0')
		return 0;

	/*
	 * The output can never be longer than the remaining input, since
	 * each input byte yields at most one output byte.
	 */
	cline = emalloc(l + 1);

	complen = 0;
	hasalnum = 0;
	iscomment = 0;

	for (q = (unsigned char *)cline; l && *p; p++, l--) {
		if (isspace(*p)) {
			/* Collapse a run of whitespace into one ' '. */
			if (complen && isspace(q[-1]))
				continue;
			*q++ = ' ';
			complen++;
		} else {
			if (!iscomment && *p == '#') {
				/* Data followed by a comment: cut here. */
				if (hasalnum)
					break;
				/* Comment-only line: keep the rest. */
				iscomment = 1;
			} else if (isalnum(*p))
				hasalnum = 1;
			*q++ = *p;
			complen++;
		}
	}

	/* Eat trailing space */
	while (complen && isspace(q[-1])) {
		--q;
		--complen;
	}
	*q = '\0';

	if (!hasalnum) {
		/* Nothing worth comparing; do not hand out the buffer. */
		free(cline);
		return 0;
	}
	*compline = cline;
	*len = complen;
	return 1;
}
173