#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
#
# LC_ALL overrides every other LC_* variable, so when it is set its value
# is first copied to the individual categories and LC_ALL itself is
# removed; otherwise the LC_NUMERIC=C setting below would have no effect.
if [[ "${LC_ALL}" != "" ]] ; then
    export \
        LC_MONETARY="${LC_ALL}" \
        LC_MESSAGES="${LC_ALL}" \
        LC_COLLATE="${LC_ALL}" \
        LC_CTYPE="${LC_ALL}"
    unset LC_ALL
fi
export LC_NUMERIC=C

# constants values for tokenizer/parser stuff
# (read-only compound variable; members are used as ${ch.newline} etc.)
compound -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)

# Print an error message to stderr and terminate the script.
#
# Arguments: $* - words of the message (joined into one string)
# Globals:   progname (read) - used as message prefix; it is not set in
#            the part of the file shown here.
# Exit code: always exits with status 1.
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

# Print a diagnostic message to stderr.
#
# Arguments: $* - words of the message (joined into one string)
function printmsg
{
	# "--" ends option parsing so that a message which starts with
	# a '-' is printed instead of being taken as options of "print"
	print -u2 -- "$*"
}


# Split a tag attribute string (e.g. 'foo="bar" baz=1 qux') into an
# array of compound variables.
#
# Arguments:
#   $1 - attribute string
#   $2 - name of the array variable which receives the result; one entry
#        per attribute is stored, either ( name=... value=... ) or, for
#        attributes without a '=' (HTML style, e.g. <foo baz>),
#        just ( name=... )
#
# A value which is completely enclosed in '' or "" gets these quotes
# stripped. fatal_error is called when more than 1000 entries were
# produced (sanity check against input which cannot be consumed).
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # skip whitespaces
        # ("currattrlen" still holds the length of the attribute consumed
        # in the previous round, i.e. the scan starts right behind it)
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z):
        #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\')}"
        #
        # The value of a single-quoted attribute must not contain a "'",
        # otherwise the longest match would run up to the closing quote
        # of the _last_ single-quoted attribute in the string.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}

# XML document handler
#
# Callback for the tokenizer; stores comment tags in a document tree.
#
# Arguments:
#   $1 - name of the callback array; its "arg_tree" entry holds the name
#        of the document tree variable
#   $2 - tag type; only "tag_comment" is handled here, everything else
#        is ignored
#   $3 - tag value (for comments: the comment text)
#   $4 - attribute string (not used by this handler)
#
# NOTE(review): "stack" is not defined in this function; it is presumably
# a variable set up elsewhere in the file whose items[pos] element holds
# the name of the current node list - confirm against the rest of the file.
function handle_xml_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    nameref doc=${callbacks["arg_tree"]}
    # node list of the current stack level and its element counter
    # (the counter variable has the same name with a "num" suffix)
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    case "${tag_type}" in
        tag_comment)
            # append a new comment node and advance the counter
            nodepath[${nodesnum}]+=(
                typeset tagtype="comment"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
    esac

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}

# Simple XML/HTML tokenizer.
#
# Reads the document from stdin and calls the handlers stored in the
# array whose name is passed as $1. Entries which are empty or unset are
# skipped. The handlers are called like this ("<$1>" is the array name):
#   document_start: handler <$1> "document_start"
#   tag_text:       handler <$1> "tag_text" <text>
#   tag_comment:    handler <$1> "tag_comment" <comment text>
#   tag_begin:      handler <$1> "tag_begin" <tag name> <attribute string>
#   tag_end:        handler <$1> "tag_end" <tag name>
#   document_end:   handler <$1> "document_end" "exit_success"
#
# Tags which end with a '/' (like "<br />") cause a "tag_begin" call
# which is directly followed by a "tag_end" call.
# Text which follows the last tag of the input is not passed to "tag_text".
# A single newline is written to stdout when the input is exhausted.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    # process the input character by character
    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # the first character after '<' tells whether this is an end tag
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remaining tag content up to (excluding) the next '>'
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                # (a '>' within the comment text ends the read above early)
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    buf+=">"
                    while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                fi

                # pass the text without the leading "!--" and the trailing "--"
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                # name: everything before the first whitespace,
                # attributes: everything behind it
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            # plain text: collect until the next tag starts
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}

# enumerate comments in a shell (or shell-like) script
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - limit for the number of comments; scanning stops as soon as the
#        number of stored comments is larger than this value
#
# Only lines which start with '#' in the first column count as comment.
# Consecutive comment lines are merged into one array element; the
# leading '#' of each line is removed and each line ends with a newline.
# Always returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line # keep the input line buffer local to this function

	# The loop body runs one more time after "read" failed at EOF;
	# this flushes a comment block which ends at the end of the file.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			# non-comment line: store the collected comment block (if any)
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	return 0
}


# enumerate comments in a troff document
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - limit for the number of comments; scanning stops as soon as the
#        number of stored comments is larger than this value
#
# A comment line starts (in the first column) with zero or more '.'
# characters followed by '\"'. Consecutive comment lines are merged into
# one array element; the comment introducer is removed from each line
# and each line ends with a newline.
# Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line # keep the input line buffer local to this function

	# The loop body runs one more time after "read" failed at EOF;
	# this flushes a comment block which ends at the end of the file.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			# non-comment line: store the collected comment block (if any)
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	return 0
}


# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - limit for the number of comments; scanning stops as soon as the
#        number of stored comments is larger than this value
#   $4 - maximum number of characters to scan; longer files are truncated
#        (a warning is printed to stderr)
#
# Both C comments and C++ comments are collected (without the comment
# delimiters). A C++ comment which starts in the same column as the
# C++ comment on the directly preceding line is appended (separated by
# a newline) to the array element of that comment instead of creating a
# new element. Comment introducers within '' or "" literals are ignored.
#
# Returns 0 on success and 1 if the end of the input was reached within
# a comment or a double-quoted literal (an unterminated single-quoted
# literal only causes a warning).
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		# "c" is the current character, "c2" the current plus the
		# following one (used to detect "/*", "*/" and "//")
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				# skip the second character of the delimiter
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			# a C++ comment ends at the end of the line or at EOF
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				else
					comment_array[ca++]="${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				fi
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			# the literal ends at the next quote which does not
			# directly follow a backslash
			if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# same column as the "//" on the previous line ?
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}

# determine file type
#
# Arguments:
#   $1 - name of the file to examine
#   $2 - name of the variable which receives the file format name
#        (e.g. "c_source", "shell", "troff", "html", "font_ttf")
#
# Returns 0 if a format was found; returns 1 if the file is not a plain,
# readable file or if no format matched (the variable is not modified
# in this case).
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	[[ ! -f "$filename" ]] && return 1
	[[ ! -r "$filename" ]] && return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source"
			return 0
			;;
		*Imakefile)
			file_format="imakefile"
			return 0
			;;
		*Makefile)
			file_format="makefile"
			return 0
			;;
	esac

	# pass two: match by file content via /usr/bin/file
	# (run in the "C" locale to get untranslated descriptions)
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff"
			return 0
			;;
		~(E)html\ document)
			file_format="html"
			return 0
			;;
		~(E)sgml\ document)
			file_format="sgml"
			return 0
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell"
			return 0
			;;
		~(E)executable.*/perl\ script)
			file_format="perl"
			return 0
			;;
	esac

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)
			file_format="troff"
			return 0
			;;
		*.html)
			file_format="html"
			return 0
			;;
		*.sgml)
			file_format="sgml"
			return 0
			;;
		*.xml)
			file_format="xml"
			return 0
			;;
		*.png)
			file_format="image_png"
			return 0
			;;
		*.xcf)
			file_format="image_xcf"
			return 0
			;;
		*.shar)
			file_format="archive_shell"
			return 0
			;;
		*.sh)
			file_format="shell"
			return 0
			;;
		*.pcf)
			file_format="font_pcf"
			return 0
			;;
		*.bdf)
			file_format="font_bdf"
			return 0
			;;
		*.pmf)
			file_format="font_pmf"
			return 0
			;;
		*.ttf | *.otf)
			file_format="font_ttf"
			return 0
			;;
		*.pfa | *.pfb)
			file_format="font_postscript"
			return 0
			;;
	esac

	return 1
}

# Create the record for one file and fill it with the file's hash sums,
# its file format and the comments found in the file.
#
# Arguments:
#   $1 - name of the associative array which holds the records; the new
#        record is stored using the filename as key
#   $2 - name of the file to process
#   $3 - limit for the number of comments (passed to the comment scanner)
#   $4 - maximum number of characters to scan (passed to the comment
#        scanner; only enumerate_comments_cpp makes use of it)
#
# Record members: filename, fileformat_found, file_format, hashsum
# (keys "md5" and "sha1"), comments_parsed and comments.
#
# Returns 1 if the file format could not be determined (the record then
# only contains the filename and the hash sums), otherwise 0.
# "comments_parsed" is only set to "true" if a comment scanner exists
# for the format and finished successfully.
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum  -x md5  < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if get_file_format "$filename" datatype ; then
		records[${filename}].fileformat_found="true"
		records[${filename}].file_format="$datatype"
	else
		return 1
	fi

	# pick the comment scanner which matches the file format; formats
	# without a scanner leave "comments_parsed" at "false"
	case "$datatype" in
		c_source|imakefile)
			enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		shell|makefile)
			enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		troff)
			enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
		#		records[${filename}].comments_parsed=true
		#	;;
	esac

	return 0
}

6568462SApril.Chin@Sun.COM# parse HTTP return code, cookies etc.
6578462SApril.Chin@Sun.COMfunction parse_http_response
6588462SApril.Chin@Sun.COM{
6598462SApril.Chin@Sun.COM	nameref response="$1"
6608462SApril.Chin@Sun.COM	typeset h statuscode statusmsg i
6618462SApril.Chin@Sun.COM
6628462SApril.Chin@Sun.COM	# we use '\r' as additional IFS to filter the final '\r'
6638462SApril.Chin@Sun.COM	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
6648462SApril.Chin@Sun.COM	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
6658462SApril.Chin@Sun.COM	[[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
6668462SApril.Chin@Sun.COM	response.statuscode="$statuscode"
6678462SApril.Chin@Sun.COM	response.statusmsg="$statusmsg"
6688462SApril.Chin@Sun.COM
6698462SApril.Chin@Sun.COM	# skip remaining headers
6708462SApril.Chin@Sun.COM	while IFS='' read -r i ; do
6718462SApril.Chin@Sun.COM		[[ "$i" == $'\r' ]] && break
6728462SApril.Chin@Sun.COM
6738462SApril.Chin@Sun.COM		# strip '\r' at the end
6748462SApril.Chin@Sun.COM		i="${i/~(Er)$'\r'/}"
6758462SApril.Chin@Sun.COM
6768462SApril.Chin@Sun.COM		case "$i" in
6778462SApril.Chin@Sun.COM			~(Eli)Content-Type:.*)
6788462SApril.Chin@Sun.COM				response.content_type="${i/~(El).*:[[:blank:]]*/}"
6798462SApril.Chin@Sun.COM				;;
6808462SApril.Chin@Sun.COM			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
6818462SApril.Chin@Sun.COM				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
6828462SApril.Chin@Sun.COM				;;
6838462SApril.Chin@Sun.COM			~(Eli)Transfer-Encoding:.*)
6848462SApril.Chin@Sun.COM				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
6858462SApril.Chin@Sun.COM				;;
6868462SApril.Chin@Sun.COM		esac
6878462SApril.Chin@Sun.COM	done
6888462SApril.Chin@Sun.COM
6898462SApril.Chin@Sun.COM	return 0
6908462SApril.Chin@Sun.COM}
6918462SApril.Chin@Sun.COM
#
# cat_http_body - copy the body of a HTTP response from stdin to stdout
#
# Arguments:
#	$1 - value of the "Transfer-Encoding" header ("chunked", compared
#	     case-insensitively, selects de-chunking; everything else
#	     copies stdin unmodified)
#
# A chunked body is a sequence of
#	<hex size>[;extensions] CRLF <size bytes of data> CRLF
# terminated by a chunk of size 0. Trailers after the last chunk are
# not read. Always returns 0.
#
function cat_http_body
{
	typeset -l emode="$1"	# lowercase - coding names are case-insensitive
	typeset hexchunksize=""
	typeset -i chunksize=0

	if [[ "${emode}" != "chunked" ]] ; then
		cat
		return 0
	fi

	# '\r' as IFS filters the '\r' in front of the newline
	while IFS=$'\r' read -r hexchunksize ; do
		# drop chunk extensions and padding ("1a;name=value" ---> "1a")
		hexchunksize="${hexchunksize%%[;[:space:]]*}"

		# the data of each chunk is followed by a CRLF which shows up
		# here as empty line - skip it and read the next chunk size
		[[ "${hexchunksize}" == "" ]] && continue

		# stop at anything which is not a hexadecimal number
		[[ "${hexchunksize}" == +([0-9a-fA-F]) ]] || break

		# (( ... )) fails for the value 0, i.e. for the last chunk
		(( chunksize = 16#${hexchunksize} )) || break

		# bs=1 makes sure that exactly ${chunksize} bytes are consumed
		dd bs=1 count="${chunksize}" 2>/dev/null
	done

	return 0
}
7108462SApril.Chin@Sun.COM
711*10898Sroland.mainz@nrubsig.orgfunction cat_url
7128462SApril.Chin@Sun.COM{
7138462SApril.Chin@Sun.COM	typeset protocol="${1%://*}"
7148462SApril.Chin@Sun.COM	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"
715*10898Sroland.mainz@nrubsig.org
716*10898Sroland.mainz@nrubsig.org	if [[ "${protocol}" == "file" ]] ; then
717*10898Sroland.mainz@nrubsig.org		cat "${path1}"
718*10898Sroland.mainz@nrubsig.org		return $?
719*10898Sroland.mainz@nrubsig.org	elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
720*10898Sroland.mainz@nrubsig.org		typeset host="${path1%%/*}"
721*10898Sroland.mainz@nrubsig.org		typeset path="${path1#*/}"
722*10898Sroland.mainz@nrubsig.org		typeset port="${host##*:}"
7238462SApril.Chin@Sun.COM
724*10898Sroland.mainz@nrubsig.org		integer netfd
725*10898Sroland.mainz@nrubsig.org		compound httpresponse # http response
7268462SApril.Chin@Sun.COM
727*10898Sroland.mainz@nrubsig.org		# If URL did not contain a port number in the host part then look at the
728*10898Sroland.mainz@nrubsig.org		# protocol to get the port number
729*10898Sroland.mainz@nrubsig.org		if [[ "${port}" == "${host}" ]] ; then
730*10898Sroland.mainz@nrubsig.org			case "${protocol}" in
731*10898Sroland.mainz@nrubsig.org				"http")  port=80 ;;
732*10898Sroland.mainz@nrubsig.org				"https") port=443 ;;
733*10898Sroland.mainz@nrubsig.org				*)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
734*10898Sroland.mainz@nrubsig.org			esac
735*10898Sroland.mainz@nrubsig.org		else
736*10898Sroland.mainz@nrubsig.org			host="${host%:*}"
737*10898Sroland.mainz@nrubsig.org		fi
7388462SApril.Chin@Sun.COM
739*10898Sroland.mainz@nrubsig.org		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"
7408462SApril.Chin@Sun.COM
741*10898Sroland.mainz@nrubsig.org		# prechecks
742*10898Sroland.mainz@nrubsig.org		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
743*10898Sroland.mainz@nrubsig.org		[[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
744*10898Sroland.mainz@nrubsig.org		[[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
745*10898Sroland.mainz@nrubsig.org		[[ "${path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }
7468462SApril.Chin@Sun.COM
747*10898Sroland.mainz@nrubsig.org		# open TCP channel
748*10898Sroland.mainz@nrubsig.org		if [[ "${protocol}" == "https" ]] ; then
749*10898Sroland.mainz@nrubsig.org			compound sslfifo
750*10898Sroland.mainz@nrubsig.org			sslfifo.dir="$(mktemp -d)"
751*10898Sroland.mainz@nrubsig.org			sslfifo.in="${sslfifo.dir}/in"
752*10898Sroland.mainz@nrubsig.org			sslfifo.out="${sslfifo.dir}/out"
753*10898Sroland.mainz@nrubsig.org
754*10898Sroland.mainz@nrubsig.org			# register an EXIT trap and use "errexit" to leave it at the first error
755*10898Sroland.mainz@nrubsig.org			# (this saves lots of if/fi tests for error checking)
756*10898Sroland.mainz@nrubsig.org			trap "rm -r \"${sslfifo.dir}\"" EXIT
757*10898Sroland.mainz@nrubsig.org			set -o errexit
758*10898Sroland.mainz@nrubsig.org
759*10898Sroland.mainz@nrubsig.org			mkfifo "${sslfifo.in}" "${sslfifo.out}"
760*10898Sroland.mainz@nrubsig.org
761*10898Sroland.mainz@nrubsig.org			# create async openssl child to handle https
762*10898Sroland.mainz@nrubsig.org			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &
7638462SApril.Chin@Sun.COM
764*10898Sroland.mainz@nrubsig.org			# send HTTP request
765*10898Sroland.mainz@nrubsig.org			request="GET /${path} HTTP/1.1\r\n"
766*10898Sroland.mainz@nrubsig.org			request+="Host: ${host}\r\n"
767*10898Sroland.mainz@nrubsig.org			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2009-05-08; $(uname -s -r -p))\r\n"
768*10898Sroland.mainz@nrubsig.org			request+="Connection: close\r\n"
769*10898Sroland.mainz@nrubsig.org			print -n -- "${request}\r\n" >>	"${sslfifo.in}"
770*10898Sroland.mainz@nrubsig.org
771*10898Sroland.mainz@nrubsig.org			# collect response and send it to stdout
772*10898Sroland.mainz@nrubsig.org			{
773*10898Sroland.mainz@nrubsig.org				parse_http_response httpresponse
774*10898Sroland.mainz@nrubsig.org				cat_http_body "${httpresponse.transfer_encoding}"
775*10898Sroland.mainz@nrubsig.org			} <"${sslfifo.out}"
776*10898Sroland.mainz@nrubsig.org
777*10898Sroland.mainz@nrubsig.org			wait || { print -u2 -f "%s: openssl failed.\n" ; exit 1 ; }
778*10898Sroland.mainz@nrubsig.org
779*10898Sroland.mainz@nrubsig.org			return 0
780*10898Sroland.mainz@nrubsig.org		else
781*10898Sroland.mainz@nrubsig.org			redirect {netfd}<> "/dev/tcp/${host}/${port}"
782*10898Sroland.mainz@nrubsig.org			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }
783*10898Sroland.mainz@nrubsig.org
784*10898Sroland.mainz@nrubsig.org			# send HTTP request
785*10898Sroland.mainz@nrubsig.org			request="GET /${path} HTTP/1.1\r\n"
786*10898Sroland.mainz@nrubsig.org			request+="Host: ${host}\r\n"
787*10898Sroland.mainz@nrubsig.org			request+="User-Agent: crawlsrccomments/ksh93 (2009-05-08; $(uname -s -r -p))\r\n"
788*10898Sroland.mainz@nrubsig.org			request+="Connection: close\r\n"
789*10898Sroland.mainz@nrubsig.org			print -n -- "${request}\r\n" >&${netfd}
7908462SApril.Chin@Sun.COM
791*10898Sroland.mainz@nrubsig.org			# collect response and send it to stdout
792*10898Sroland.mainz@nrubsig.org			parse_http_response httpresponse <&${netfd}
793*10898Sroland.mainz@nrubsig.org			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}
7948462SApril.Chin@Sun.COM
795*10898Sroland.mainz@nrubsig.org			# close connection
796*10898Sroland.mainz@nrubsig.org			redirect {netfd}<&-
797*10898Sroland.mainz@nrubsig.org
798*10898Sroland.mainz@nrubsig.org			return 0
799*10898Sroland.mainz@nrubsig.org		fi
800*10898Sroland.mainz@nrubsig.org	else
801*10898Sroland.mainz@nrubsig.org		return 1
802*10898Sroland.mainz@nrubsig.org	fi
803*10898Sroland.mainz@nrubsig.org	# notreached
8048462SApril.Chin@Sun.COM}
8058462SApril.Chin@Sun.COM
#
# print_stats - count the records by their flags and print the totals
#
# Arguments:
#	$1 - (optional) name of the associative array with the file
#	     records; without it the variable "records" visible to this
#	     function is used
#
# The result is printed as compound variable via "print -v".
# "license_info_found" is only present in records which were visited
# by one of the print_comments_* functions; records without it count
# as "without license info".
#
function print_stats
{
	set -o errexit

	if (( $# > 0 )) ; then
		nameref records="$1"
	fi
	typeset i

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# Notes:
	# - counting does not depend on the order, so the keys are used
	#   directly (and quoted, file names may contain whitespace)
	# - the flags are compared as strings instead of being executed as
	#   commands, an unset flag would otherwise be run as empty command
	# - "+= 1" is used because (( x++ )) evaluates to 0 for the first
	#   increment, which is a non-zero exit status under "errexit"
	for i in "${!records[@]}" ; do
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format += 1 ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	print -v stats
	return 0
}
8468462SApril.Chin@Sun.COM
8478462SApril.Chin@Sun.COM
#
# print_comments_plain - print every matching comment of every file
#
# Arguments:
#	$1 - name of the associative array with the file records
#	$2 - name of the options compound variable (the members
#	     filepattern.accept/reject and commentpattern.accept/reject
#	     are used)
#
# Files are processed in sorted order. Each visited record gets a
# "license_info_found" member ("true" if at least one comment matched).
#
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# apply the file name filters (an empty pattern disables a filter)
		[[ "${options.filepattern.accept}" != "" && "${node.filename}" != ${options.filepattern.accept} ]] && continue
		[[ "${options.filepattern.reject}" != "" && "${node.filename}" == ${options.filepattern.reject} ]] && continue

		node.license_info_found=false

		# nothing to look at if no comments were extracted
		"${node.comments_parsed}" || continue

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# "accept" selects a comment, "reject" overrides "accept"
			[[ "${options.commentpattern.accept}" != "" && "$s" == ${options.commentpattern.accept} ]] && match=true
			[[ "${options.commentpattern.reject}" != "" && "$s" == ${options.commentpattern.reject} ]] && match=false

			"${match}" || continue

			printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
			printf "%s\n" "$s"
			node.license_info_found=true
		done

		# report files where none of the comments matched
		"${node.license_info_found}" && continue

		printf "## no match found in '%s'," "${node.filename}"
		printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
			"${node.comments_parsed}" \
			"${node.fileformat_found}" \
			"${node.file_format}"
	done

	return 0
}
9058462SApril.Chin@Sun.COM
9068462SApril.Chin@Sun.COMfunction print_comments_duplicates_compressed
9078462SApril.Chin@Sun.COM{
9088462SApril.Chin@Sun.COM	set -o errexit
9098462SApril.Chin@Sun.COM
9108462SApril.Chin@Sun.COM	nameref records=$1
9118462SApril.Chin@Sun.COM	nameref options=$2
9128462SApril.Chin@Sun.COM	typeset i j
9138462SApril.Chin@Sun.COM	typeset -A hashed_comments
9148462SApril.Chin@Sun.COM	integer num_hashed_comments
9158462SApril.Chin@Sun.COM
9168462SApril.Chin@Sun.COM	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
9178462SApril.Chin@Sun.COM		nameref node=records[$i]
9188462SApril.Chin@Sun.COM
9198462SApril.Chin@Sun.COM		if [[ "${options.filepattern.accept}" != "" ]] && \
9208462SApril.Chin@Sun.COM		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
9218462SApril.Chin@Sun.COM			continue
9228462SApril.Chin@Sun.COM		fi
9238462SApril.Chin@Sun.COM		if [[ "${options.filepattern.reject}" != "" ]] && \
9248462SApril.Chin@Sun.COM		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
9258462SApril.Chin@Sun.COM			continue
9268462SApril.Chin@Sun.COM		fi
9278462SApril.Chin@Sun.COM
9288462SApril.Chin@Sun.COM		node.license_info_found=false
9298462SApril.Chin@Sun.COM
9308462SApril.Chin@Sun.COM		if ! "${node.comments_parsed}" ; then
9318462SApril.Chin@Sun.COM			continue
9328462SApril.Chin@Sun.COM		fi
9338462SApril.Chin@Sun.COM
9348462SApril.Chin@Sun.COM		for j in "${!node.comments[@]}" ; do
9358462SApril.Chin@Sun.COM			typeset s="${node.comments[$j]}"
9368462SApril.Chin@Sun.COM			typeset match=false
9378462SApril.Chin@Sun.COM
9388462SApril.Chin@Sun.COM			if [[ "${options.commentpattern.accept}" != "" ]] && \
9398462SApril.Chin@Sun.COM		   	   [[ "$s" == ${options.commentpattern.accept} ]] ; then
9408462SApril.Chin@Sun.COM				match=true
9418462SApril.Chin@Sun.COM			fi
9428462SApril.Chin@Sun.COM			if [[ "${options.commentpattern.reject}" != "" ]] && \
9438462SApril.Chin@Sun.COM	  		   [[ "$s" == ${options.commentpattern.reject} ]] ; then
9448462SApril.Chin@Sun.COM				match=false
9458462SApril.Chin@Sun.COM			fi
9468462SApril.Chin@Sun.COM
9478462SApril.Chin@Sun.COM
9488462SApril.Chin@Sun.COM			if "${match}" ; then
9498462SApril.Chin@Sun.COM				typeset -l hashstring # lowercase
9508462SApril.Chin@Sun.COM
9518462SApril.Chin@Sun.COM				# compress the comment (e.g. convert whiteapces and '.,:;()"' to newline characters) ...
9528462SApril.Chin@Sun.COM				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
9538462SApril.Chin@Sun.COM				# ... and then create a MD5 hash from this string
9548462SApril.Chin@Sun.COM				hash="$(sum -x md5 <<<"${hashstring}")"
9558462SApril.Chin@Sun.COM
9568462SApril.Chin@Sun.COM				nameref hc_node=hashed_comments[${hash}]
9578462SApril.Chin@Sun.COM
9588462SApril.Chin@Sun.COM				if [[ "${hc_node}" == "" ]] ; then
9598462SApril.Chin@Sun.COM					# build node if there isn't one yet
9608462SApril.Chin@Sun.COM					typeset -a hc_node.fileids
9618462SApril.Chin@Sun.COM					typeset    hc_node.comment="$s"
9628462SApril.Chin@Sun.COM				fi
9638462SApril.Chin@Sun.COM
9648462SApril.Chin@Sun.COM				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )
9658462SApril.Chin@Sun.COM
9668462SApril.Chin@Sun.COM				node.license_info_found=true
9678462SApril.Chin@Sun.COM			fi
9688462SApril.Chin@Sun.COM		done
9698462SApril.Chin@Sun.COM
9708462SApril.Chin@Sun.COM		if ! "${node.license_info_found}" ; then
9718462SApril.Chin@Sun.COM			printf "## no match found in "
9728462SApril.Chin@Sun.COM			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
9738462SApril.Chin@Sun.COM			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
9748462SApril.Chin@Sun.COM				"${node.comments_parsed}" \
9758462SApril.Chin@Sun.COM				"${node.fileformat_found}" \
9768462SApril.Chin@Sun.COM				"${node.file_format}"
9778462SApril.Chin@Sun.COM		fi
9788462SApril.Chin@Sun.COM	done
9798462SApril.Chin@Sun.COM
9808462SApril.Chin@Sun.COM	# print comments and all fileids (filename+hash sums) which include this comment
9818462SApril.Chin@Sun.COM	for i in "${!hashed_comments[@]}" ; do
9828462SApril.Chin@Sun.COM		printf "\f## The comment (ID=%s) ..." "${i}"
9838462SApril.Chin@Sun.COM		printf "\n-- snip --"
9848462SApril.Chin@Sun.COM		printf "\n%s" "${hashed_comments[${i}].comment}"
9858462SApril.Chin@Sun.COM		printf "\n-- snip --"
9868462SApril.Chin@Sun.COM		printf "\n... applies to the following files:\n"
9878462SApril.Chin@Sun.COM		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array memeber
9888462SApril.Chin@Sun.COM	done
9898462SApril.Chin@Sun.COM
9908462SApril.Chin@Sun.COM	return 0
9918462SApril.Chin@Sun.COM}
9928462SApril.Chin@Sun.COM
#
# do_crawl - implementation of the "crawl" command
#
# Arguments: the command name followed by the options described in
# ${do_crawl_usage} (the first argument is dropped before the options
# are parsed).
#
# Reads one file name per line from stdin, extracts the comments of
# each file via extract_comments and writes all records as compound
# variable to "crawlsrccomments_extracted_comments.cpv" in the current
# directory. Files for which extract_comments fails are not fatal.
#
function do_crawl
{
	set -o errexit

	typeset i

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# read filenames from stdin; IFS='' and "-r" keep leading/trailing
	# blanks and backslashes in the file names intact
	while IFS='' read -r i ; do
		# an empty line is not a file name
		[[ "$i" == "" ]] && continue

		printf "## scanning %s ...\n" "$i"
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}
10308462SApril.Chin@Sun.COM
#
# do_getcomments - implementation of the "getcomments" command
#
# Arguments: the command name followed by the options described in
# ${do_getcomments_usage} (the first argument is dropped before the
# options are parsed).
#
# Loads the database (a local file or a http/https URL, optionally
# compressed with bzip2 or gzip), reads it as compound variable and
# prints the comments which pass the filters, optionally followed by
# statistics. Temporary files are removed by an EXIT trap.
#
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		# mktemp creates the file with an unpredictable name - a fixed
		# name in /tmp could be prepared by another user (e.g. as symlink)
		database="$(mktemp "/tmp/extract_license_cat_url_XXXXXX")"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# hand over the name of the records array, "scan" is local
		# to this function
		print_stats scan.records
	fi

	return 0
}
11348462SApril.Chin@Sun.COM
#
# usage - print the usage message of a command and leave the script
#
# Arguments:
#	$1 - name of the variable which holds the getopts usage string
#
# Does not return; the exit code of the script is 2.
#
function usage
{
	nameref usagemsg="$1"

	# restart the option parsing and feed the '-?' option to getopts,
	# which makes it print the usage text
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'

	exit 2
}
11428462SApril.Chin@Sun.COM
# getopts usage/manual text of the "getcomments" command
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and store this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bgetcomments\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
11658462SApril.Chin@Sun.COM
# getopts usage/manual text of the "crawl" command
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
11818462SApril.Chin@Sun.COM
# getopts usage/manual text of the main command
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
11978462SApril.Chin@Sun.COM
11988462SApril.Chin@Sun.COM
# ---- main program ----------------------------------------------------------
#
# Flow: register builtins, switch on errexit, parse the (empty) set of global
# options, then hand over to the handler of the requested subcommand.

# Register these utilities as shell builtins.  A failure here is not checked;
# only a missing "sum" builtin is treated as fatal.
for builtin_name in basename cat date uname rm ; do
	builtin "${builtin_name}"
done
unset builtin_name
builtin sum || fatal_error "sum builtin not found."

# from here on, abort at the first command which fails
set -o errexit

typeset progname="$(basename "${0}")"

# The top level handles no options itself, so anything getopts hands back
# is answered with the usage message.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	usage crawlsrccomments_usage
done
shift $(( OPTIND - 1 ))

# First remaining operand selects the subcommand.  It is deliberately NOT
# shifted away: the handlers receive it as their own first argument.
typeset cmd="${1}"

if [[ "${cmd}" == "crawl" ]] ; then
	progname+=" ${cmd}"
	do_crawl "$@"
	exit $?
elif [[ "${cmd}" == "getcomments" ]] ; then
	progname+=" ${cmd}"
	do_getcomments "$@"
	exit $?
else
	# unknown or missing subcommand
	usage crawlsrccomments_usage
fi

# Only reached if the usage helper returned instead of exiting.
fatal_error "not reached."
# EOF.
1240