1*8462SApril.Chin@Sun.COM#!/usr/bin/ksh93
2*8462SApril.Chin@Sun.COM
3*8462SApril.Chin@Sun.COM#
4*8462SApril.Chin@Sun.COM# CDDL HEADER START
5*8462SApril.Chin@Sun.COM#
6*8462SApril.Chin@Sun.COM# The contents of this file are subject to the terms of the
7*8462SApril.Chin@Sun.COM# Common Development and Distribution License (the "License").
8*8462SApril.Chin@Sun.COM# You may not use this file except in compliance with the License.
9*8462SApril.Chin@Sun.COM#
10*8462SApril.Chin@Sun.COM# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11*8462SApril.Chin@Sun.COM# or http://www.opensolaris.org/os/licensing.
12*8462SApril.Chin@Sun.COM# See the License for the specific language governing permissions
13*8462SApril.Chin@Sun.COM# and limitations under the License.
14*8462SApril.Chin@Sun.COM#
15*8462SApril.Chin@Sun.COM# When distributing Covered Code, include this CDDL HEADER in each
16*8462SApril.Chin@Sun.COM# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17*8462SApril.Chin@Sun.COM# If applicable, add the following below this CDDL HEADER, with the
18*8462SApril.Chin@Sun.COM# fields enclosed by brackets "[]" replaced with your own identifying
19*8462SApril.Chin@Sun.COM# information: Portions Copyright [yyyy] [name of copyright owner]
20*8462SApril.Chin@Sun.COM#
21*8462SApril.Chin@Sun.COM# CDDL HEADER END
22*8462SApril.Chin@Sun.COM#
23*8462SApril.Chin@Sun.COM
24*8462SApril.Chin@Sun.COM#
25*8462SApril.Chin@Sun.COM# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
26*8462SApril.Chin@Sun.COM# Use is subject to license terms.
27*8462SApril.Chin@Sun.COM#
28*8462SApril.Chin@Sun.COM
# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin in front of /usr/bin because
# the tools in /usr/bin are not POSIX-conformant.
PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
export PATH

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
#
# LC_ALL would override LC_NUMERIC, so its value is handed down to the
# individual categories below and LC_ALL itself is removed afterwards.
if [[ -n "${LC_ALL}" ]] ; then
    export LC_MONETARY="${LC_ALL}"
    export LC_MESSAGES="${LC_ALL}"
    export LC_COLLATE="${LC_ALL}"
    export LC_CTYPE="${LC_ALL}"
    unset LC_ALL
fi
LC_NUMERIC=C
export LC_NUMERIC

# constant values for tokenizer/parser stuff
# (read-only compound variable, members are used as ${ch.newline} etc.)
typeset -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)
52*8462SApril.Chin@Sun.COM
# Print "<progname>: <message>" to stderr and terminate the script with
# exit code 1.
#
# Arguments: $* - words of the message (joined into a single string)
# Globals:   progname (read) - not defined in this part of the file
function fatal_error
{
	# "--" ends option processing of "print" so that the text is always
	# treated as data, even when it happens to start with a '-'
	# (for example when "progname" was derived from a login shell's $0).
	print -u2 -- "${progname}: $*"
	exit 1
}
58*8462SApril.Chin@Sun.COM
# Print a message to stderr.
#
# Arguments: $* - words of the message (joined into a single string)
function printmsg
{
	# "--" ends option processing of "print"; without it a message which
	# starts with '-' (e.g. "-n foo") would be parsed as options instead
	# of being printed.
	print -u2 -- "$*"
}
63*8462SApril.Chin@Sun.COM
64*8462SApril.Chin@Sun.COM
# Split a tag attribute string (e.g. 'foo="bar" baz=1 huz') into an array
# of compound entries.
#
# Arguments: $1 - attribute string
#            $2 - name of the array which receives the entries; each entry
#                 is ( name=... value=... ), or just ( name=... ) for an
#                 attribute without '=' (HTML style, like <foo baz>)
# Calls fatal_error when the string cannot be tokenized or when more than
# 1000 attributes were collected.
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # "currattrlen" still holds the length of the attribute consumed by
        # the previous iteration (0 on the first pass), i.e. this skips the
        # previous attribute plus any whitespace following it
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z).
        # The single-quoted alternative must stop at the next single quote
        # ([^\']), otherwise "a='1' b='2'" is swallowed as one attribute.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # An empty match means the next character can not start an
        # attribute. "s" would never get shorter again, so give up right
        # away instead of collecting empty entries until the assert below
        # fires.
        if (( currattrlen == 0 )) ; then
            fatal_error "$0: cannot parse attribute string at '${s}'"
        fi

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}
118*8462SApril.Chin@Sun.COM
# XML document handler (callback for xml_tok)
#
# Arguments: $1 - name of the callback table (associative array)
#            $2 - event type; only "tag_comment" is acted upon here
#            $3 - event value (the comment text for "tag_comment")
#            $4 - attribute string (not used by this handler)
# Globals:   stack (read) - not defined in this part of the file;
#            NOTE(review): presumably maintained by the tag handlers,
#            confirm against the rest of the script.
function handle_xml_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    nameref doc=${callbacks["arg_tree"]}
    # the top-of-stack entry names the current node list, and the same
    # name with a "num" suffix names its element counter
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    if [[ "${tag_type}" == "tag_comment" ]] ; then
        # append a "comment" node to the current node list
        nodepath[${nodesnum}]+=(
            typeset tagtype="comment"
            typeset tagvalue="${tag_value}"
        )
        (( nodesnum++ ))
    fi

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}
143*8462SApril.Chin@Sun.COM
# Simple XML/HTML tokenizer: reads the document from stdin character by
# character and reports what it finds through a callback table.
#
# Arguments: $1 - name of an associative array of callbacks. Entries used:
#                 document_start, tag_text, tag_comment, tag_begin,
#                 tag_end, document_end. An empty/missing entry disables
#                 that event.
# Each callback is invoked as
#     <callback> <table name> <event name> [<value> [<attribute string>]]
# A final newline is written to stdout when the document has been read.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    if [[ -n "${callbacks["document_start"]}" ]] ; then
        ${callbacks["document_start"]} "${1}" "document_start"
    fi

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        # everything outside of <...> is text content, just collect it
        if [[ "$c" != "<" ]] ; then
            buf+="$c"
            continue
        fi

        # a tag starts here: flush any text content first
        if [[ -n "$buf" ]] ; then
            if [[ -n "${callbacks["tag_text"]}" ]] ; then
                ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
            fi
            buf=""
        fi

        # the first character after '<' tells whether this is an end tag,
        # the remainder of the tag is everything up to the next '>'
        IFS='' read -r -N 1 c
        if [[ "$c" == "/" ]] ; then
            isendtag=true
        else
            buf="$c"
        fi
        IFS='' read -r -d '>' c
        buf+="$c"

        # handle comments
        if [[ "$buf" == ~(El)!-- ]] ; then
            # did we read the comment completely ?
            if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                # no - the '>' which ended the read above was part of the
                # comment text; keep reading until the buffer ends in "--"
                buf+=">"
                while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
                    IFS='' read -r -N 1 c || break
                    buf+="$c"
                done
            fi

            # report the text without the leading "!--" and trailing "--"
            if [[ -n "${callbacks["tag_comment"]}" ]] ; then
                ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
            fi
            buf=""
            continue
        fi

        # check if the tag starts and ends at the same time (like "<br />")
        issingletag=false
        if [[ "${buf}" == ~(Er).*/ ]] ; then
            issingletag=true
            buf="${buf%*/}"
        fi

        # check if the tag has attributes (e.g. space after name)
        namebuf="$buf"
        attrbuf=""
        if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
            namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
            attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
        fi

        if ! ${isendtag} ; then
            if [[ -n "${callbacks["tag_begin"]}" ]] ; then
                ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"
            fi
        fi

        # end tags, and tags like <br/> (which are start- and end-tag in
        # one piece), both report a "tag_end"
        if ${isendtag} || ${issingletag} ; then
            if [[ -n "${callbacks["tag_end"]}" ]] ; then
                ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            fi
        fi
        buf=""
    done

    if [[ -n "${callbacks["document_end"]}" ]] ; then
        ${callbacks["document_end"]} "${1}" "document_end" "exit_success"
    fi

    print # final newline to make filters like "sed" happy
}
228*8462SApril.Chin@Sun.COM
# enumerate comments in a shell (or shell-like) script
#
# Consecutive lines which start with '#' in the first column form one
# comment; the leading '#' of each line is removed and the lines are
# joined with newlines.
#
# Arguments: $1 - input file
#            $2 - name of the array which receives the comments
#            $3 - comment limit. NOTE(review): the limit is tested with
#                 "ca > max" after an entry was stored, so up to
#                 max_num_comments+1 entries can be stored - confirm
#                 whether that is intended.
# Returns:   0
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	# keep the read buffer local; without this declaration "read" would
	# create (or overwrite) a global variable called "line"
	typeset line=""

	# The loop body runs once more after "read" reported EOF. That extra
	# pass handles a last line without trailing newline and flushes a
	# comment which reaches to the end of the file.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	return 0
}
263*8462SApril.Chin@Sun.COM
264*8462SApril.Chin@Sun.COM
# enumerate comments in a troff document
#
# Consecutive lines which start with the troff comment introducer (\"
# optionally preceded by dots, e.g. .\") form one comment; the introducer
# is removed and the lines are joined with newlines.
#
# Arguments: $1 - input file
#            $2 - name of the array which receives the comments
#            $3 - comment limit. NOTE(review): the limit is tested with
#                 "ca > max" after an entry was stored, so up to
#                 max_num_comments+1 entries can be stored - confirm
#                 whether that is intended.
# Returns:   0
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	# keep the read buffer local; without this declaration "read" would
	# create (or overwrite) a global variable called "line"
	typeset line=""

	# The loop body runs once more after "read" reported EOF. That extra
	# pass handles a last line without trailing newline and flushes a
	# comment which reaches to the end of the file.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	return 0
}
299*8462SApril.Chin@Sun.COM
300*8462SApril.Chin@Sun.COM
# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#
# The file content is scanned character by character with a small state
# machine which knows about C comments, C++ comments and single-/double-
# quoted literals (comment markers inside literals are ignored).
# A C++ comment which starts in the same column as the C++ comment on the
# line directly above is appended to that entry instead of creating a
# new one.
#
# Arguments: $1 - input file
#            $2 - name of the array which receives the comments
#            $3 - comment limit. NOTE(review): the limit is tested with
#                 "ca > max" after an entry was stored, so up to
#                 max_num_comments+1 entries can be stored - confirm
#                 whether that is intended.
#            $4 - only this many characters of the file are scanned
# Returns:   0 on success, 1 if the scan ended inside a comment or a
#            double-quoted literal
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	typeset line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2 # current character and current+next character

	typeset comment # text of the comment which is currently collected

	typeset state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		typeset cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			typeset comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			typeset comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		# keep track of the position within the current line
		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			# inside /* ... */
			if [[ "$c2" == "*/" ]] ; then
				# skip the second character of the terminator
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			# inside // ... (ends at newline or at EOF)
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text: either extend the previous
				# entry or start a new one
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
				else
					comment_array[ca++]="${comment}"
				fi
				# remember where this comment started so that the
				# next line can be recognized as a continuation
				(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
				   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			# inside '...': only an unescaped ' ends the literal
			if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			# inside "...": only an unescaped " ends the literal
			if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=false
			fi
		else
			# plain code: look for the start of a comment or literal
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# same column as the "//" on the previous line ?
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=true
			fi
		fi
	done

	# sanity checks: the scanner should be back in "plain code" state
	if [[ -n "$comment" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}
476*8462SApril.Chin@Sun.COM
# determine file type
#
# Arguments: $1 - name of the file to classify
#            $2 - name of the variable which receives the format name
#                 (e.g. "c_source", "shell", "troff"); it is only written
#                 when a format was recognized
# Returns:   0 if a format was recognized, 1 if the file is not a plain,
#            readable file or no rule matched
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	[[ ! -f "$filename" ]] && return 1
	[[ ! -r "$filename" ]] && return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source" ; return 0 ;;
		*Imakefile)
			file_format="imakefile" ; return 0 ;;
		*Makefile)
			file_format="makefile" ; return 0 ;;
	esac

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	# The output starts with "<filename>:". Remove that prefix so that
	# the patterns below only see the verdict; otherwise a path which
	# happens to contain e.g. "roff" would be classified as troff
	# regardless of the file's content.
	fileeval="${fileeval#"${filename}":}"
	case "$fileeval" in
		~(E)roff)
			file_format="troff" ; return 0 ;;
		~(E)html\ document)
			file_format="html" ; return 0 ;;
		~(E)sgml\ document)
			file_format="sgml" ; return 0 ;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell" ; return 0 ;;
		~(E)executable.*/perl\ script)
			file_format="perl" ; return 0 ;;
	esac

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)
			file_format="troff" ; return 0 ;;
		*.html)
			file_format="html" ; return 0 ;;
		*.sgml)
			file_format="sgml" ; return 0 ;;
		*.xml)
			file_format="xml" ; return 0 ;;
		*.png)
			file_format="image_png" ; return 0 ;;
		*.xcf)
			file_format="image_xcf" ; return 0 ;;
		*.shar)
			file_format="archive_shell" ; return 0 ;;
		*.sh)
			file_format="shell" ; return 0 ;;
		*.pcf)
			file_format="font_pcf" ; return 0 ;;
		*.bdf)
			file_format="font_bdf" ; return 0 ;;
		*.pmf)
			file_format="font_pmf" ; return 0 ;;
		*.ttf | *.otf)
			file_format="font_ttf" ; return 0 ;;
		*.pfa | *.pfb)
			file_format="font_postscript" ; return 0 ;;
	esac

	return 1
}
599*8462SApril.Chin@Sun.COM
#
# extract_comments - create the record for one file and collect its comments
#
# Arguments:
#   $1 - name of the associative array which receives the record
#        (the record is stored under the file name)
#   $2 - name of the file to scan
#   $3 - maximum number of comments (handed to the comment enumerator)
#   $4 - maximum file size for the scan (handed to the comment enumerator)
#
# Returns 1 if get_file_format does not recognize the file; otherwise 0,
# even if no enumerator exists for the format or the enumerator failed
# (in both cases "comments_parsed" stays "false").
#
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""
	typeset enumerator=""	# name of the function which collects the comments

	# start with an empty record ...
	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	# ... and add the checksums of the file content
	records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	# give up early if we do not know what kind of file this is
	if ! get_file_format "$filename" datatype ; then
		return 1
	fi
	records[${filename}].fileformat_found="true"
	records[${filename}].file_format="$datatype"

	# pick the comment enumerator which matches the file format
	case "$datatype" in
		c_source|imakefile)
			enumerator="enumerate_comments_cpp"
			;;
		shell|makefile)
			enumerator="enumerate_comments_shell"
			;;
		troff)
			enumerator="enumerate_comments_troff"
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerator="enumerate_comments_xml"
		#	;;
	esac

	# a failing enumerator is not fatal, the record just stays "unparsed"
	if [[ "$enumerator" != "" ]] ; then
		if "$enumerator" "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} ; then
			records[${filename}].comments_parsed=true
		fi
	fi

	return 0
}
655*8462SApril.Chin@Sun.COM
# parse HTTP return code, cookies etc.
#
# Reads the status line and the header lines of a HTTP response from stdin
# and stores the result in the compound variable whose name is passed as $1:
#   statuscode, statusmsg          - always set on success
#   content_type, content_length,
#   transfer_encoding              - only set if the header is present
# Reading stops after the empty line which ends the header block, i.e. the
# body is left unread on stdin.
#
# Returns 1 (with a message on stderr) if the status line is malformed,
# otherwise 0.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i value

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# "+" and not "*": an empty status code is invalid, too
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# scan remaining headers
	while IFS='' read -r i ; do
		# strip '\r' at the end
		i="${i%$'\r'}"

		# an empty line (with or without '\r') ends the header block
		[[ "$i" == "" ]] && break

		# header value: everything after the FIRST ':' without the
		# leading blanks (the value itself may contain more ':')
		value="${i#*:}"
		value="${value##+([[:blank:]])}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${value}"
				;;
			# only accept a real number as content length
			~(Elri)Content-Length:[[:blank:]]*[0-9]+[[:blank:]]*)
				integer response.content_length="${value}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${value}"
				;;
		esac
	done

	return 0
}
691*8462SApril.Chin@Sun.COM
#
# cat_http_body - copy the body of a HTTP response from stdin to stdout
#
# Arguments:
#   $1 - transfer encoding; "chunked" selects the chunk decoder, any other
#        value copies stdin unmodified
#
# Chunked mode reads "<hex size>CRLF<data>CRLF" records until a chunk size
# of zero, a line which is not a plain hexadecimal number, or end-of-file
# is seen. Always returns 0.
#
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# '\r' as IFS filters the '\r' in front of the newline
		while IFS=$'\r' read -r hexchunksize ; do
			# the data of each chunk is followed by CRLF, which
			# shows up here as an empty line: skip it instead of
			# treating it as the end of the body
			if [[ "${hexchunksize}" == "" ]] ; then
				continue
			fi

			# stop at anything which is not a plain hex number
			if [[ "${hexchunksize}" == *[!0123456789abcdefABCDEF]* ]] ; then
				break
			fi

			# a chunk size of zero marks the end of the body
			(( (chunksize=16#${hexchunksize}) > 0 )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null
		done
	else
		cat
	fi

	return 0
}
710*8462SApril.Chin@Sun.COM
#
# cat_http - fetch an URL via HTTP/1.1 and write the response body to stdout
#
# Arguments:
#   $1 - URL ("protocol://host[:port]/path")
#
# The port is taken from the URL; without one it is 80 for "http" and is
# looked up via getent(1) for any other protocol.
#
# Returns 1 (with a message on stderr) if a part of the URL is missing,
# the TCP connection cannot be opened or the response header is malformed,
# otherwise 0.
#
function cat_http
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	typeset host="${path1%%/*}"
	typeset path=""
	typeset port="${host##*:}"
	typeset request	# keep the request text local to this function

	integer netfd
	typeset -C httpresponse # http response

	# only take a path if the URL has one (without this check an URL
	# like "http://foo.bat.net" would use the host name as path)
	if [[ "${path1}" == */* ]] ; then
		path="${path1#*/}"
	fi

	# If URL did not contain a port number in the host part then look at the
	# protocol to get the port number
	if [[ "${port}" == "${host}" ]] ; then
		case "${protocol}" in
			"http") port=80 ;;
			*)      port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
		esac
	else
		host="${host%:*}"
	fi

	printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

	# prechecks
	[[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
	[[ "${port}"     == "" ]] && { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
	[[ "${host}"     == "" ]] && { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
	[[ "${path}"     == "" ]] && { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

	# open TCP channel (the failure is handled right here, which also
	# works if the caller runs with "errexit" enabled)
	redirect {netfd}<>"/dev/tcp/${host}/${port}" ||
		{ print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }

	# send HTTP request
	request="GET /${path} HTTP/1.1\r\n"
	request+="Host: ${host}\r\n"
	request+="User-Agent: crawlsrccomments/ksh93 (2008-06-14; $(uname -s -r -p))\r\n"
	request+="Connection: close\r\n"
	print -n -- "${request}\r\n" >&${netfd}

	# collect response and send it to stdout; do not try to read a body
	# if the header was garbage and do not leave the connection open
	if ! parse_http_response httpresponse <&${netfd} ; then
		redirect {netfd}<&-
		return 1
	fi
	cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

	# close connection
	redirect {netfd}<&-

	return 0
}
762*8462SApril.Chin@Sun.COM
#
# print_stats - count the records per category and print the counters
#
# Arguments:
#   $1 - (optional) name of the associative array of file records. Without
#        this argument the function uses whatever variable called "records"
#        is visible in its own scope, as it always did.
#
# The counters are printed to stdout as a compound variable ("%B" format).
#
function print_stats
{
	set -o errexit

	# ksh93 functions do not see the local variables of their caller,
	# therefore the caller can hand over the records by name
	if (( $# > 0 )) ; then
		nameref records="$1"
	fi

	typeset i

	# gather some statistics
	typeset stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# The keys are read line-by-line to keep file names with blanks in
	# one piece. The loop is the last part of the pipeline and therefore
	# runs in the current shell, i.e. the counters survive the loop.
	printf "%s\n" "${!records[@]}" | sort | while IFS= read -r i ; do
		# printf prints one empty line if there are no records at all
		[[ "$i" != "" ]] || continue

		# Notes:
		# - the flags are compared as strings; they come from the
		#   database and must never be executed as commands
		# - "+= 1" is used because (( x++ )) returns a non-zero exit
		#   status if x was 0, which is fatal under "errexit"
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format += 1 ))
		fi

		# "license_info_found" is unset for records which were
		# skipped by the file filters; count them as "without"
		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	printf "%B\n" stats
	return 0
}
803*8462SApril.Chin@Sun.COM
804*8462SApril.Chin@Sun.COM
#
# print_comments_plain - print all matching comments, file by file
#
# Arguments:
#   $1 - name of the associative array of file records
#   $2 - name of the compound variable with the filters
#        (filepattern.accept, filepattern.reject,
#        commentpattern.accept, commentpattern.reject)
#
# Records are processed in sorted key order. For each record which passes
# the file filters "license_info_found" is set to "true" or "false"; a
# line starting with "## no match found" is printed for parsed records
# without any matching comment.
#
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s	# text of the current comment
	typeset match	# "true" or "false"

	# The keys are read line-by-line to keep file names with blanks in
	# one piece. The loop is the last part of the pipeline and therefore
	# runs in the current shell, i.e. changes to the records survive.
	printf "%s\n" "${!records[@]}" | sort | while IFS= read -r i ; do
		# printf prints one empty line if there are no records at all
		[[ "$i" != "" ]] || continue

		nameref node="records[$i]"

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		# the flags are compared as strings; they come from the
		# database and must never be executed as commands
		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"
			match=false

			# the reject pattern overrides the accept pattern
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if [[ "${match}" == "true" ]] ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}
862*8462SApril.Chin@Sun.COM
#
# print_comments_duplicates_compressed - print each distinct comment once,
# followed by the list of files which contain it
#
# Arguments:
#   $1 - name of the associative array of file records
#   $2 - name of the compound variable with the filters
#        (filepattern.accept, filepattern.reject,
#        commentpattern.accept, commentpattern.reject)
#
# Two comments count as the same comment if their lowercased text is
# identical after each run of whitespace and punctuation characters was
# replaced by "${ch.newline}" (compared via the MD5 sum of that text).
# For each record which passes the file filters "license_info_found" is
# set to "true" or "false"; a line starting with "## no match found" is
# printed for parsed records without any matching comment.
#
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s	# text of the current comment
	typeset match	# "true" or "false"
	typeset hash	# MD5 sum of the compressed comment
	typeset -l hashstring # lowercase
	typeset -A hashed_comments

	# The keys are read line-by-line to keep file names with blanks in
	# one piece. The loop is the last part of the pipeline and therefore
	# runs in the current shell, i.e. "hashed_comments" survives the loop.
	printf "%s\n" "${!records[@]}" | sort | while IFS= read -r i ; do
		# printf prints one empty line if there are no records at all
		[[ "$i" != "" ]] || continue

		nameref node="records[$i]"

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		# the flags are compared as strings; they come from the
		# database and must never be executed as commands
		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"
			match=false

			# the reject pattern overrides the accept pattern
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if [[ "${match}" == "true" ]] ; then
				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}
949*8462SApril.Chin@Sun.COM
#
# do_crawl - implementation of the "crawl" subcommand
#
# Arguments:
#   $1     - name of the subcommand (dropped)
#   $2 ... - options, see "do_crawl_usage"
#
# Reads one file name per line from stdin, extracts the comments of each
# file and writes all records to "crawlsrccomments_extracted_comments.cpv"
# in the current directory. Files which cannot be processed are skipped.
#
function do_crawl
{
	set -o errexit

	typeset i
	typeset options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	typeset scan=(
		typeset -A records
	)

	# read filenames from stdin; "IFS=" and "-r" make sure that blanks
	# around and backslashes within a file name are left alone
	while IFS= read -r i ; do
		printf "## scanning %s ...\n" "$i"
		# a file which cannot be scanned must not stop the crawl
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	printf "%B\n" scan |
		sed $'s/^#.*$//;s/^\(//;s/^\)//;s/^\ttypeset -A records=\(//;s/^\t\)//' >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}
988*8462SApril.Chin@Sun.COM
#
# do_getcomments - implementation of the "getcomments" subcommand
#
# Arguments:
#   $1     - name of the subcommand (dropped)
#   $2 ... - options, see "do_getcomments_usage"
#
# Loads the database (a local file or a http://-URL, optionally compressed
# with bzip2 or gzip), prints the comments which pass the filters and,
# if requested, the statistics. Progress messages go to stderr, the report
# goes to stdout. Temporary files are removed by an EXIT trap.
#
function do_getcomments
{
	set -o errexit

	# vars
	typeset scan=(
		typeset -A records
	)
	typeset database
	typeset tmp

	typeset options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		typeset filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		typeset commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	# ("rm -f": a listed file which is already gone is not an error)
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -f -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Note: all temporary files are created via mktemp(1); file names
	# in /tmp which can be guessed in advance invite symlink attacks

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)http://.* ]] ; then
		database="$(mktemp "/tmp/extract_license_cat_http_XXXXXX")" || fatal_error "Can't create temporary file."
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_http "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")" || fatal_error "Can't create temporary file."
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gunzip_XXXXXX")" || fatal_error "Can't create temporary file."
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	# (the "read" is the last part of the pipeline and therefore runs in
	# the current shell)
	print -u2 "# reading records..."
	{
		printf "("
		cat "${database}"
		printf ")\n"
	} | read -C scan.records || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# hand over the name of the record array (the local variables
		# of this function are not visible inside other ksh93 functions)
		print_stats scan.records
	fi

	return 0
}
1098*8462SApril.Chin@Sun.COM
#
# usage - show the usage text of a command and leave the script
#
# Arguments:
#   $1 - name of the variable which holds the getopts option string
#
# Runs getopts on a literal '-?' argument (ksh93 getopts answers that with
# the usage text built from the option string) and exits with status 2.
#
function usage
{
	nameref usage_spec="$1"

	# restart option processing from scratch
	OPTIND=0
	getopts -a "${progname}" "${usage_spec}" OPT '-?'

	exit 2
}
1106*8462SApril.Chin@Sun.COM
# getopts option string (options and manual page) of the "getcomments" subcommand
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and store this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
	queries on this database.]
[D:database?Database file for input (either file or http://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1129*8462SApril.Chin@Sun.COM
# getopts option string (options and manual page) of the "crawl" subcommand
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1145*8462SApril.Chin@Sun.COM
# getopts option string (manual page) of the main command
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1161*8462SApril.Chin@Sun.COM
1162*8462SApril.Chin@Sun.COM
# program start

# make the utilities used by this script available as shell builtins;
# "sum" is the only one the script cannot live without
builtin basename cat date uname rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

typeset progname="$(basename "${0}")"

# the main command has no options of its own: whatever getopts finds
# ends up in the usage message
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	usage crawlsrccomments_usage
done
shift $(( OPTIND - 1 ))

# first operand selects the subcommand, which gets all operands
# (including its own name)
typeset cmd="$1"

case "$cmd" in
	"crawl" | "getcomments")
		progname+=" ${cmd}"
		"do_${cmd}" "$@"
		exit $?
		;;
	*)
		usage crawlsrccomments_usage
		;;
esac

fatal_error "not reached."
# EOF.
1204