xref: /onnv-gate/usr/src/lib/libshell/common/scripts/crawlsrccomments.sh (revision 12068:08a39a083754)
#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
#

# On Solaris the tools in /usr/bin are not POSIX-conformant, therefore the
# XPG6/XPG4 directories have to be searched first.
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Run all math in the "C" locale so that the radix point is always '.'
# (and not e.g. ',' as in de_DE.* locales). This must be in place _before_
# any floating-point constants are defined in this script.
# A non-empty LC_ALL is first copied into the individual categories below
# and then removed, so that the LC_NUMERIC setting is not shadowed by it.
if [[ -n "${LC_ALL}" ]] ; then
    export LC_MONETARY="${LC_ALL}"
    export LC_MESSAGES="${LC_ALL}"
    export LC_COLLATE="${LC_ALL}"
    export LC_CTYPE="${LC_ALL}"
    unset LC_ALL
fi
export LC_NUMERIC=C
448462SApril.Chin@Sun.COM
# Read-only character constants for the tokenizer/parser code
# (referenced as ${ch.newline}, ${ch.tab}, ${ch.formfeed}).
typeset -C -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)
518462SApril.Chin@Sun.COM
# Report a fatal error and terminate the script.
#
# All arguments are joined into one message which is written to stderr,
# prefixed with "${progname}: ". The script then exits with status 1.
# NOTE(review): "progname" is not set in this function; it is expected to
# be defined elsewhere in the script.
function fatal_error
{
	typeset msg="$*"

	print -u2 "${progname}: ${msg}"
	exit 1
}
578462SApril.Chin@Sun.COM
# Write a message to stderr; all arguments are joined into one line.
function printmsg
{
	typeset msg="$*"

	print -u2 "${msg}"
}
628462SApril.Chin@Sun.COM
638462SApril.Chin@Sun.COM
# Split an XML/HTML attribute string into an array of compound entries.
#
# Arguments:
#   $1 - attribute string (e.g. 'foo="bar" baz=1 selected')
#   $2 - name of the array which receives the result; each element is
#        either ( name=... value=... ) or, for attributes without '=',
#        just ( name=... )
#
# Surrounding '' or "" quotes are removed from attribute values.
# Calls fatal_error once more than 1000 entries were produced (assert
# against a runaway loop).
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa="$2" # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # "currattrlen" still holds the length of the attribute consumed
        # by the previous iteration (0 in the first one): advance it over
        # the following whitespace and cut both from the front of "s"
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z):
        #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
        # "nextattr" is what remains of "s" after the leading attribute,
        # so the length difference is the length of that attribute
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}
1178462SApril.Chin@Sun.COM
# XML document handler: tokenizer callback which records comments.
#
# Arguments:
#   $1 - name of the callback table (associative array)
#   $2 - token type (only "tag_comment" is acted upon)
#   $3 - token value (the comment text)
#   $4 - tag attributes (currently not used)
#
# A "tag_comment" token is appended as ( tagtype="comment" tagvalue=... )
# to the node list on top of "stack" and the node counter is incremented.
# NOTE(review): "stack" is not defined in this function; it is expected to
# be set up elsewhere in the script.
function handle_xml_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    nameref doc=${callbacks["arg_tree"]}
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    if [[ "${tag_type}" == "tag_comment" ]] ; then
        nodepath[${nodesnum}]+=(
            typeset tagtype="comment"
            typeset tagvalue="${tag_value}"
        )
        (( nodesnum++ ))
    fi

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}
1428462SApril.Chin@Sun.COM
# Simple XML/HTML tokenizer.
#
# Reads the document from stdin character by character and reports the
# tokens through the callback table whose name is passed as $1. Entries
# of that table which are empty are skipped. Every callback is called as
#     <callback> <table name> <event name> [<value> [<attributes>]]
# with the events "document_start", "tag_text", "tag_comment",
# "tag_begin" (value=tag name, plus attribute string), "tag_end"
# (value=tag name) and "document_end" (value="exit_success").
# Tags like "<br />" produce a "tag_begin" followed by a "tag_end".
#
# A newline is written to stdout after the document has been processed.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ -n "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ -n "$buf" ]] ; then
                [[ -n "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # first character after '<' decides whether this is an end tag
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag (the '>' itself is consumed
            # by "read" but not stored)
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # No - the '>' which stopped "read" was part of the
                    # comment text. Put it back and continue reading
                    # until the real terminator "-->" shows up; stopping
                    # at a bare "--" would leave the closing '>' in the
                    # input where it would be reported as text content.
                    buf+=">"
                    while [[ "$buf" != "!--"*"-->" ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                    # drop the closing '>' so that "buf" has the same
                    # "!--...--" shape as in the single-read case
                    buf=${buf%">"}
                fi

                # pass the text between "!--" and "--"
                [[ -n "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ -n "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ -n "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ -n "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            # plain text: collect until the next tag starts
            buf+="$c"
        fi
    done

    [[ -n "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
2278462SApril.Chin@Sun.COM
# Enumerate comments in a shell (or shell-like) script.
#
# Every run of consecutive lines which start with '#' is stored as one
# array element: the leading '#' is removed from each line and the lines
# are joined with newlines.
#
# Arguments:
#   $1 - name of the input file
#   $2 - name of the array which receives the comments
#   $3 - limit: scanning stops as soon as more than this number of
#        comments has been stored
#
# Always returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line=""
	typeset comment=""

	while (( res == 0 )) ; do
		# A failing "read" (EOF) may still deliver a last line without
		# trailing newline - therefore only remember the status here
		# and process "line" before the loop terminates.
		IFS='' read -r line || res=$?

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		elif [[ "$comment" != "" ]] ; then
			# first non-comment line after a comment: flush it
			comment_array[ca++]="${comment}"
			comment=""

			if (( ca > max_num_comments )) ; then
				break
			fi
		fi
	done <"${input_file}"

	# Flush a comment which ends with the last line of a file that has
	# no trailing newline (no non-comment line followed it in the loop).
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}
2628462SApril.Chin@Sun.COM
2638462SApril.Chin@Sun.COM
# Enumerate comments in a troff document.
#
# Every run of consecutive lines which start with an optional sequence of
# '.' followed by \" is stored as one array element: that comment
# introducer is removed from each line and the lines are joined with
# newlines.
#
# Arguments:
#   $1 - name of the input file
#   $2 - name of the array which receives the comments
#   $3 - limit: scanning stops as soon as more than this number of
#        comments has been stored
#
# Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line=""
	typeset comment=""

	while (( res == 0 )) ; do
		# A failing "read" (EOF) may still deliver a last line without
		# trailing newline - therefore only remember the status here
		# and process "line" before the loop terminates.
		IFS='' read -r line || res=$?

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		elif [[ "$comment" != "" ]] ; then
			# first non-comment line after a comment: flush it
			comment_array[ca++]="${comment}"
			comment=""

			if (( ca > max_num_comments )) ; then
				break
			fi
		fi
	done <"${input_file}"

	# Flush a comment which ends with the last line of a file that has
	# no trailing newline (no non-comment line followed it in the loop).
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}
2988462SApril.Chin@Sun.COM
2998462SApril.Chin@Sun.COM
# Enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#
# Arguments:
#   $1 - name of the input file
#   $2 - name of the array which receives the comments
#   $3 - limit: scanning stops as soon as more than this number of
#        comments has been stored
#   $4 - maximum number of characters of the file which are scanned
#
# "/* ... */" comments become one array element each. A "//" comment
# which starts in the same column as a "//" comment in the line directly
# above is appended (separated by a newline) to that previous element
# instead of creating a new one. Comment markers inside '...' and "..."
# literals are ignored.
#
# Returns 0 on success and 1 if the scanner was still inside a comment
# or double-quoted literal (or had unflushed comment text) when it
# stopped. An unterminated single-quoted literal only causes a warning.
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate the content to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Walk over the source code. The loop deliberately runs one step
	# past the last character (file_pos == content_length): "c" is
	# empty then and acts as EOF marker (needed for cases like a C++
	# comment which is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		# "c2" is the current character plus its successor (to
		# recognize two-character tokens), "c" the current one only
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		# keep track of the column/line position
		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			# inside /* ... */: collect text until the terminator
			if [[ "$c2" != "*/" ]] ; then
				comment+="$c"
				continue
			fi

			# skip the second character of the terminator
			(( file_pos++, line_pos.x++ ))
			state.in_c_comment=false

			# flush comment text
			comment_array[ca++]="${comment}"
			comment=""

			if (( ca > max_num_comments )) ; then
				break
			fi
		elif ${state.cxx.in_comment} ; then
			# inside // ...: collect text until newline or EOF
			if [[ "$c" != "${ch.newline}" && "$c" != "" ]] ; then
				comment+="$c"
				continue
			fi

			state.cxx.in_comment=false

			# flush comment text - either as continuation of the
			# previous element or as new element
			if ${state.cxx.comment_continued} ; then
				comment_array[ca-1]+="${ch.newline}${comment}"
			else
				comment_array[ca++]="${comment}"
			fi
			# remember where this comment started (used by the
			# continuation check of the next "//")
			(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
			   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
			comment=""

			if (( ca > max_num_comments )) ; then
				break
			fi
		elif ${state.in_sq_literal} ; then
			# a quote which is not preceded by a backslash ends the literal
			if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			# a quote which is not preceded by a backslash ends the literal
			if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=false
			fi
		elif [[ "$c2" == "/*" ]] ; then
			# start of a C comment; skip the second marker character
			(( file_pos++, line_pos.x++ ))
			state.in_c_comment=true
			comment=""
		elif [[ "$c2" == "//" ]] ; then
			# start of a C++ comment; skip the second marker character
			(( file_pos++, line_pos.x++ ))

			# same column as the "//" in the line directly above ?
			if (( state.cxx.comment_prev_pos.x == line_pos.x && \
				state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
				state.cxx.comment_continued=true
			else
				state.cxx.comment_continued=false
			fi
			(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
			state.cxx.in_comment=true
			comment=""
		elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
			state.in_sq_literal=true
		elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
			state.in_dq_literal=true
		fi
	done

	# consistency checks for the state in which the scanner stopped
	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}
4758462SApril.Chin@Sun.COM
# Determine the file format of a file.
#
# Arguments:
#   $1 - name of the file
#   $2 - name of the variable which receives the format name (e.g.
#        "c_source", "shell", "troff"); it is only written on success
#
# Returns 0 if a format was found and 1 if the file is not a plain,
# readable file or its format could not be determined.
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file
	typeset detected="" # format name found so far ("" == none yet)

	# "filename" must be a plain, readable file
	[[ -f "$filename" && -r "$filename" ]] || return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			detected="c_source" ;;
		*Imakefile)
			detected="imakefile" ;;
		*Makefile)
			detected="makefile" ;;
	esac

	# pass two: match by file content via /usr/bin/file
	if [[ "$detected" == "" ]] ; then
		fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
		case "$fileeval" in
			~(E)roff)
				detected="troff" ;;
			~(E)html\ document)
				detected="html" ;;
			~(E)sgml\ document)
				detected="sgml" ;;
			~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
				detected="shell" ;;
			~(E)executable.*/perl\ script)
				detected="perl" ;;
		esac
	fi

	# pass three: fallback to filename matching
	if [[ "$detected" == "" ]] ; then
		case "$filename" in
			*.man)
				detected="troff" ;;
			*.html)
				detected="html" ;;
			*.sgml)
				detected="sgml" ;;
			*.xml)
				detected="xml" ;;
			*.png)
				detected="image_png" ;;
			*.xcf)
				detected="image_xcf" ;;
			*.shar)
				detected="archive_shell" ;;
			*.sh)
				detected="shell" ;;
			*.pcf)
				detected="font_pcf" ;;
			*.bdf)
				detected="font_bdf" ;;
			*.pmf)
				detected="font_pmf" ;;
			*.ttf | *.otf)
				detected="font_ttf" ;;
			*.pfa | *.pfb)
				detected="font_postscript" ;;
		esac
	fi

	# no pass produced a result
	[[ "$detected" != "" ]] || return 1

	file_format="$detected"
	return 0
}
5988462SApril.Chin@Sun.COM
# Create the record for one file and fill it with hash sums, the file
# format and (if the format is supported) the comments found in the file.
#
# Arguments:
#   $1 - name of the associative array of records; the new record is
#        stored under the file name as key
#   $2 - name of the file
#   $3 - maximum number of comments (passed to the enumerator)
#   $4 - maximum file size for the scan (passed to the enumerator)
#
# Returns 1 if the file format could not be determined (the record then
# only contains the hash sums), otherwise 0. Whether the comments were
# parsed successfully is recorded in the "comments_parsed" field.
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""
	typeset enumerator="" # name of the matching enumerate_comments_* function

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum  -x md5  < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	# without a known file format there is nothing else we can do
	get_file_format "$filename" datatype || return 1

	records[${filename}].fileformat_found="true"
	records[${filename}].file_format="$datatype"

	# pick the comment enumerator for this file format
	case "$datatype" in
		c_source|imakefile)
			enumerator="enumerate_comments_cpp"
			;;
		shell|makefile)
			enumerator="enumerate_comments_shell"
			;;
		troff)
			enumerator="enumerate_comments_troff"
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerator="enumerate_comments_xml"
		#	;;
	esac

	if [[ "$enumerator" != "" ]] ; then
		"${enumerator}" "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
			records[${filename}].comments_parsed=true
	fi

	return 0
}
6548462SApril.Chin@Sun.COM
# Parse the status line and the headers of a HTTP response read from stdin.
#
# Arguments:
#   $1 - name of a compound variable which receives the members
#        "statuscode" and "statusmsg" and - if the matching header is
#        present - "content_type", "content_length" and "transfer_encoding"
# Input:
#   The response on stdin. Reading stops after the empty line which
#   terminates the header block, i.e. the body is left for the caller.
# Returns:
#   0 on success, 1 if the status line is missing or malformed
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i value

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# "+" (not "*"): an empty status code must be rejected, too
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# scan the remaining headers
	while IFS='' read -r i ; do
		# strip '\r' at the end
		i="${i%$'\r'}"

		# an empty line (CRLF or bare LF) terminates the header block
		[[ "$i" == "" ]] && break

		# header value: everything after the FIRST ':' (the value itself
		# may contain colons), without leading blanks
		value="${i#*:}"
		value="${value##+([[:blank:]])}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${value}"
				;;
			# only accept a purely numeric length
			~(Elri)Content-Length:[[:blank:]]*[0-9]+[[:blank:]]*)
				integer response.content_length="${value}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${value}"
				;;
		esac
	done

	return 0
}
6908462SApril.Chin@Sun.COM
# Copy the body of a HTTP response from stdin to stdout.
#
# Arguments:
#   $1 - transfer encoding; "chunked" selects the chunk decoder, any other
#        value (including the empty string) copies stdin verbatim
# Returns:
#   always 0
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset chunkend
	typeset -i chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# Each chunk looks like "<hexsize>[;extension]\r\n<data>\r\n", the
		# list is terminated by a chunk with size 0.
		while IFS='' read -r hexchunksize ; do
			hexchunksize="${hexchunksize%$'\r'}"	# strip the final '\r'
			hexchunksize="${hexchunksize%%;*}"	# strip chunk extensions

			# stop at anything which is not a plain hexadecimal number
			[[ "${hexchunksize}" == +([[:xdigit:]]) ]] || break

			# convert to decimal; a size of 0 marks the last chunk
			# (kept in a "||" list so that a zero result does not
			# trigger "errexit")
			(( (chunksize = 16#${hexchunksize}) > 0 )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null

			# consume the "\r\n" which follows the chunk data, otherwise
			# the next size line would be read as an empty string
			IFS='' read -r chunkend || break
		done
	else
		cat
	fi

	return 0
}
7098462SApril.Chin@Sun.COM
# Write the content of a URL to stdout.
#
# Arguments:
#   $1 - URL; supported protocols are "file", "http" and "https"
#        ("https" is handled via an "openssl s_client" child process)
# Output:
#   The content of the file, or the HTTP response body without the headers.
# Returns:
#   0 on success, 1 for an unsupported protocol, an incomplete URL or a
#   failed connection; for "file" the exit code of cat is returned
function cat_url
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	if [[ "${protocol}" == "file" ]] ; then
		cat "${path1}"
		return $?
	elif [[ "${protocol}" == @(http|https) ]] ; then
		typeset host="${path1%%/*}"
		typeset path=""
		typeset port="${host##*:}"
		typeset request

		integer netfd
		integer sslpid
		compound httpresponse # http response

		# Only URLs with a "/" after the host part have a path. Without
		# this test "${path1#*/}" would leave the host name in "path".
		if [[ "${path1}" == */* ]] ; then
			path="${path1#*/}"
		fi

		# If URL did not contain a port number in the host part then look at the
		# protocol to get the port number
		if [[ "${port}" == "${host}" ]] ; then
			case "${protocol}" in
				"http")  port=80 ;;
				"https") port=443 ;;
				*)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
			esac
		else
			host="${host%:*}"
		fi

		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

		# prechecks
		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
		[[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
		[[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
		[[ "${path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

		# open TCP channel
		if [[ "${protocol}" == "https" ]] ; then
			compound sslfifo
			sslfifo.dir="$(mktemp -d)"
			sslfifo.in="${sslfifo.dir}/in"
			sslfifo.out="${sslfifo.dir}/out"

			# register an EXIT trap and use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			trap "rm -r \"${sslfifo.dir}\"" EXIT
			set -o errexit

			mkfifo "${sslfifo.in}" "${sslfifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &
			sslpid=$!

			# send HTTP request
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >>	"${sslfifo.in}"

			# collect response and send it to stdout
			{
				parse_http_response httpresponse
				cat_http_body "${httpresponse.transfer_encoding}"
			} <"${sslfifo.out}"

			# wait for the openssl child itself: "wait" without operand
			# always returns 0 and would hide a failure of the child
			wait ${sslpid} || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }

			return 0
		else
			redirect {netfd}<> "/dev/tcp/${host}/${port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

			# send HTTP request
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93 (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >&${netfd}

			# collect response and send it to stdout
			parse_http_response httpresponse <&${netfd}
			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

			# close connection
			redirect {netfd}<&-

			return 0
		fi
	else
		return 1
	fi
	# notreached
}
8048462SApril.Chin@Sun.COM
# Print statistics about the scanned records as a compound variable.
#
# Arguments:
#   $1 - (optional) name of the associative array with the file records.
#        Without this argument a variable called "records" which is
#        visible in this function's scope is used.
# Output:
#   The counters as compound variable (via "print -v") on stdout.
# Returns:
#   always 0
function print_stats
{
	set -o errexit

	# bind "records" to the array of the caller if a name was passed
	if (( $# > 0 )) ; then
		nameref records="$1"
	fi

	typeset i

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# The order does not matter for counting, therefore the subscripts are
	# used directly (this also keeps file names with whitespace intact).
	# The flags are compared as strings: records which were filtered out
	# have no "license_info_found" member and count as "without".
	# "+= 1" is used instead of "++" because "(( x++ ))" returns a non-zero
	# exit code for x==0 which would trigger "errexit".
	for i in "${!records[@]}" ; do
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format += 1 ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	print -v stats
	return 0
}
8458462SApril.Chin@Sun.COM
8468462SApril.Chin@Sun.COM
# Print all comments which pass the comment filters, one block per comment.
#
# Arguments:
#   $1 - name of the associative array with the file records
#   $2 - name of the options compound variable (uses the members
#        filepattern.accept/reject and commentpattern.accept/reject)
# Side effects:
#   Sets "license_info_found" in each record which passes the file filters.
# Returns:
#   always 0
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	# The sorted file names are read line by line, which keeps names
	# with whitespace or pattern characters intact. The loop is the last
	# element of the pipeline and therefore runs in the current shell.
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		# printf emits a single empty line if there are no records
		[[ "$i" != "" ]] || continue

		nameref node=records[$i]

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# a comment matches if it is accepted and not rejected
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if [[ "${match}" == "true" ]] ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}
9048462SApril.Chin@Sun.COM
# Print all comments which pass the comment filters, but print similar
# comments only once together with the list of files which contain them.
# Two comments are treated as the same if their lowercased text, with
# the separator characters listed in the pattern below collapsed, has
# the same MD5 hash.
#
# Arguments:
#   $1 - name of the associative array with the file records
#   $2 - name of the options compound variable (uses the members
#        filepattern.accept/reject and commentpattern.accept/reject)
# Side effects:
#   Sets "license_info_found" in each record which passes the file filters.
# Returns:
#   always 0
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset hash
	typeset -l hashstring # lowercase
	typeset -A hashed_comments

	# The sorted file names are read line by line, which keeps names
	# with whitespace or pattern characters intact. The loop is the last
	# element of the pipeline and therefore runs in the current shell.
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		# printf emits a single empty line if there are no records
		[[ "$i" != "" ]] || continue

		nameref node=records[$i]

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# a comment matches if it is accepted and not rejected
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if [[ "${match}" == "true" ]] ; then
				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}
9918462SApril.Chin@Sun.COM
# Implementation of the "crawl" subcommand: read file names from stdin
# (one per line), extract the comments of each file and write all records
# to the file "crawlsrccomments_extracted_comments.cpv" in the current
# directory.
#
# Arguments:
#   $1   - name of the subcommand (skipped)
#   $2.. - options, see do_crawl_usage
# Returns:
#   always 0
function do_crawl
{
	set -o errexit

	typeset i

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# read filenames from stdin; "IFS=''" and "-r" keep whitespace and
	# backslashes in the names exactly as they were passed
	while IFS='' read -r i ; do
		# an empty line is not a file name
		[[ "$i" != "" ]] || continue

		printf "## scanning %s ...\n" "$i"
		# a failure for one file must not stop the crawl
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# write the compound variable with all records to the database file
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}
10298462SApril.Chin@Sun.COM
# Implementation of the "getcomments" subcommand: load the database
# written by "crawl" (from a file or a http/https URL, optionally bzip2-
# or gzip-compressed) and print the comments which pass the filters.
#
# Arguments:
#   $1   - name of the subcommand (skipped)
#   $2.. - options, see do_getcomments_usage
# Output:
#   Report on stdout, progress messages on stderr.
# Returns:
#   always 0 (errors terminate the script)
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp
	typeset filedesc

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	# (temporary files are created via mktemp, predictable names in /tmp
	# could be pre-created by other users)
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		database="$(mktemp "/tmp/extract_license_cat_url_XXXXXX")"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	# (the "<filename>:" prefix is removed from the output of file(1),
	# otherwise a file name which contains "gzip" or "bzip2" would be
	# mistaken for a compressed file)
	filedesc="$(LC_ALL=C /usr/bin/file "${database}")" || filedesc=""
	filedesc="${filedesc#"${database}:"}"
	case "${filedesc}" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# pass the records by name, "scan" is local to this function
		print_stats scan.records
	fi

	return 0
}
11338462SApril.Chin@Sun.COM
# Print the usage message for one of the usage descriptions and exit.
#
# Arguments:
#   $1 - name of the variable which holds the getopts usage description
# Exits the script with exit code 2.
function usage
{
	typeset -n usagemsg="$1"

	# restart the option parser and feed it the '-?' pseudo option
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'

	exit 2
}
11418462SApril.Chin@Sun.COM
# getopts usage description (options and help text) of the "getcomments" subcommand
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and store this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bgetcomments\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
11658462SApril.Chin@Sun.COM
# getopts usage description (options and help text) of the "crawl" subcommand
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
11828462SApril.Chin@Sun.COM
# getopts usage description (help text) of the main script
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
11998462SApril.Chin@Sun.COM
12008462SApril.Chin@Sun.COM
# program start
#
# Script entry point: request builtin versions of the helper commands, parse
# the top-level options and dispatch to the "crawl" or "getcomments"
# subcommand.

# Ask ksh93 to bind these commands as shell builtins.  Only "sum" is checked;
# a failure of any of the others is ignored here (errexit is not active yet).
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

# ${ cmd ; } is the ksh93 command substitution that runs in the current shell
# environment instead of a subshell.
typeset progname="${ basename "${0}" ; }"

# The top-level usage string declares no options of its own, so whatever
# getopts hands to the loop body is answered with the usage message.
# "-a name" makes getopts use ${progname} in the messages it generates.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		*)	usage crawlsrccomments_usage ;;
	esac
done
shift $((OPTIND-1))

# First remaining operand selects the subcommand.
typeset cmd="$1"

# Note that "$@" is forwarded unshifted, i.e. the subcommand name itself is
# still the first argument seen by do_crawl/do_getcomments.
case "$cmd" in
	"crawl")
		progname+=" ${cmd}"
		do_crawl "$@"
		exit $?
		;;
	"getcomments")
		progname+=" ${cmd}"
		do_getcomments "$@"
		exit $?
		;;
	*)
		# missing or unknown subcommand
		usage crawlsrccomments_usage
		;;
esac

# Only reached if "usage" returns instead of exiting.
# NOTE(review): usage is defined elsewhere in this file - presumably it exits.
fatal_error "not reached."
# EOF.