#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g.
',' instead of 338462SApril.Chin@Sun.COM# '.' in de_DE.*-locales). This needs to be set _before_ any 348462SApril.Chin@Sun.COM# floating-point constants are defined in this script). 358462SApril.Chin@Sun.COMif [[ "${LC_ALL}" != "" ]] ; then 368462SApril.Chin@Sun.COM export \ 378462SApril.Chin@Sun.COM LC_MONETARY="${LC_ALL}" \ 388462SApril.Chin@Sun.COM LC_MESSAGES="${LC_ALL}" \ 398462SApril.Chin@Sun.COM LC_COLLATE="${LC_ALL}" \ 408462SApril.Chin@Sun.COM LC_CTYPE="${LC_ALL}" 418462SApril.Chin@Sun.COM unset LC_ALL 428462SApril.Chin@Sun.COMfi 438462SApril.Chin@Sun.COMexport LC_NUMERIC=C 448462SApril.Chin@Sun.COM 458462SApril.Chin@Sun.COM# constants values for tokenizer/parser stuff 4610898Sroland.mainz@nrubsig.orgcompound -r ch=( 478462SApril.Chin@Sun.COM newline=$'\n' 488462SApril.Chin@Sun.COM tab=$'\t' 498462SApril.Chin@Sun.COM formfeed=$'\f' 508462SApril.Chin@Sun.COM) 518462SApril.Chin@Sun.COM 528462SApril.Chin@Sun.COMfunction fatal_error 538462SApril.Chin@Sun.COM{ 548462SApril.Chin@Sun.COM print -u2 "${progname}: $*" 558462SApril.Chin@Sun.COM exit 1 568462SApril.Chin@Sun.COM} 578462SApril.Chin@Sun.COM 588462SApril.Chin@Sun.COMfunction printmsg 598462SApril.Chin@Sun.COM{ 608462SApril.Chin@Sun.COM print -u2 "$*" 618462SApril.Chin@Sun.COM} 628462SApril.Chin@Sun.COM 638462SApril.Chin@Sun.COM 648462SApril.Chin@Sun.COMfunction attrstrtoattrarray 658462SApril.Chin@Sun.COM{ 668462SApril.Chin@Sun.COM#set -o xtrace 678462SApril.Chin@Sun.COM typeset s="$1" 688462SApril.Chin@Sun.COM nameref aa=$2 # attribute array 698462SApril.Chin@Sun.COM integer aa_count=0 708462SApril.Chin@Sun.COM integer aa_count=0 718462SApril.Chin@Sun.COM typeset nextattr 728462SApril.Chin@Sun.COM integer currattrlen=0 738462SApril.Chin@Sun.COM typeset tagstr 748462SApril.Chin@Sun.COM typeset tagval 758462SApril.Chin@Sun.COM 768462SApril.Chin@Sun.COM while (( ${#s} > 0 )) ; do 778462SApril.Chin@Sun.COM # skip whitespaces 788462SApril.Chin@Sun.COM while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] 
]] ; do 798462SApril.Chin@Sun.COM (( currattrlen++ )) 808462SApril.Chin@Sun.COM done 818462SApril.Chin@Sun.COM s="${s:currattrlen:${#s}}" 828462SApril.Chin@Sun.COM 838462SApril.Chin@Sun.COM # anything left ? 848462SApril.Chin@Sun.COM (( ${#s} == 0 )) && break 858462SApril.Chin@Sun.COM 868462SApril.Chin@Sun.COM # Pattern tests: 878462SApril.Chin@Sun.COM #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}" 888462SApril.Chin@Sun.COM #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}" 898462SApril.Chin@Sun.COM #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}" 908462SApril.Chin@Sun.COM #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}" 918462SApril.Chin@Sun.COM # All pattern combined via eregex (w|x|y|z): 928462SApril.Chin@Sun.COM #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}" 938462SApril.Chin@Sun.COM nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}" 948462SApril.Chin@Sun.COM currattrlen=$(( ${#s} - ${#nextattr})) 958462SApril.Chin@Sun.COM 968462SApril.Chin@Sun.COM # add entry 978462SApril.Chin@Sun.COM tagstr="${s:0:currattrlen}" 988462SApril.Chin@Sun.COM if [[ "${tagstr}" == *=* ]] ; then 998462SApril.Chin@Sun.COM # normal case: attribute with value 1008462SApril.Chin@Sun.COM 1018462SApril.Chin@Sun.COM tagval="${tagstr#*=}" 1028462SApril.Chin@Sun.COM 1038462SApril.Chin@Sun.COM # strip quotes ('' or "") 1048462SApril.Chin@Sun.COM if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then 1058462SApril.Chin@Sun.COM tagval="${tagval:1:${#tagval}-2}" 1068462SApril.Chin@Sun.COM fi 1078462SApril.Chin@Sun.COM 1088462SApril.Chin@Sun.COM aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" ) 1098462SApril.Chin@Sun.COM else 1108462SApril.Chin@Sun.COM # special case for HTML where you have something like <foo baz> 1118462SApril.Chin@Sun.COM 
aa[${aa_count}]=( name="${tagstr}" ) 1128462SApril.Chin@Sun.COM fi 1138462SApril.Chin@Sun.COM (( aa_count++ )) 1148462SApril.Chin@Sun.COM (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert 1158462SApril.Chin@Sun.COM done 1168462SApril.Chin@Sun.COM} 1178462SApril.Chin@Sun.COM 1188462SApril.Chin@Sun.COM# XML document handler 1198462SApril.Chin@Sun.COMfunction handle_xml_document 1208462SApril.Chin@Sun.COM{ 1218462SApril.Chin@Sun.COM#set -o xtrace 1228462SApril.Chin@Sun.COM nameref callbacks=${1} 1238462SApril.Chin@Sun.COM typeset tag_type="${2}" 1248462SApril.Chin@Sun.COM typeset tag_value="${3}" 1258462SApril.Chin@Sun.COM typeset tag_attributes="${4}" 1268462SApril.Chin@Sun.COM nameref doc=${callbacks["arg_tree"]} 1278462SApril.Chin@Sun.COM nameref nodepath="${stack.items[stack.pos]}" 1288462SApril.Chin@Sun.COM nameref nodesnum="${stack.items[stack.pos]}num" 1298462SApril.Chin@Sun.COM 1308462SApril.Chin@Sun.COM case "${tag_type}" in 1318462SApril.Chin@Sun.COM tag_comment) 1328462SApril.Chin@Sun.COM nodepath[${nodesnum}]+=( 1338462SApril.Chin@Sun.COM typeset tagtype="comment" 1348462SApril.Chin@Sun.COM typeset tagvalue="${tag_value}" 1358462SApril.Chin@Sun.COM ) 1368462SApril.Chin@Sun.COM (( nodesnum++ )) 1378462SApril.Chin@Sun.COM ;; 1388462SApril.Chin@Sun.COM esac 1398462SApril.Chin@Sun.COM 1408462SApril.Chin@Sun.COM# print "xmltok: '${tag_type}' = '${tag_value}'" 1418462SApril.Chin@Sun.COM} 1428462SApril.Chin@Sun.COM 1438462SApril.Chin@Sun.COMfunction xml_tok 1448462SApril.Chin@Sun.COM{ 1458462SApril.Chin@Sun.COM typeset buf="" 1468462SApril.Chin@Sun.COM typeset namebuf="" 1478462SApril.Chin@Sun.COM typeset attrbuf="" 1488462SApril.Chin@Sun.COM typeset c="" 1498462SApril.Chin@Sun.COM typeset isendtag # bool: true/false 1508462SApril.Chin@Sun.COM typeset issingletag # bool: true/false (used for tags like "<br />") 1518462SApril.Chin@Sun.COM nameref callbacks=${1} 1528462SApril.Chin@Sun.COM 1538462SApril.Chin@Sun.COM [[ ! 
-z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start" 1548462SApril.Chin@Sun.COM 1558462SApril.Chin@Sun.COM while IFS='' read -r -N 1 c ; do 1568462SApril.Chin@Sun.COM isendtag=false 1578462SApril.Chin@Sun.COM 1588462SApril.Chin@Sun.COM if [[ "$c" == "<" ]] ; then 1598462SApril.Chin@Sun.COM # flush any text content 1608462SApril.Chin@Sun.COM if [[ "$buf" != "" ]] ; then 1618462SApril.Chin@Sun.COM [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf" 1628462SApril.Chin@Sun.COM buf="" 1638462SApril.Chin@Sun.COM fi 1648462SApril.Chin@Sun.COM 1658462SApril.Chin@Sun.COM IFS='' read -r -N 1 c 1668462SApril.Chin@Sun.COM if [[ "$c" == "/" ]] ; then 1678462SApril.Chin@Sun.COM isendtag=true 1688462SApril.Chin@Sun.COM else 1698462SApril.Chin@Sun.COM buf="$c" 1708462SApril.Chin@Sun.COM fi 1718462SApril.Chin@Sun.COM IFS='' read -r -d '>' c 1728462SApril.Chin@Sun.COM buf+="$c" 1738462SApril.Chin@Sun.COM 1748462SApril.Chin@Sun.COM # handle comments 1758462SApril.Chin@Sun.COM if [[ "$buf" == ~(El)!-- ]] ; then 1768462SApril.Chin@Sun.COM # did we read the comment completely ? 1778462SApril.Chin@Sun.COM if [[ "$buf" != ~(Elr)!--.*-- ]] ; then 1788462SApril.Chin@Sun.COM buf+=">" 1798462SApril.Chin@Sun.COM while [[ "$buf" != ~(Elr)!--.*-- ]] ; do 1808462SApril.Chin@Sun.COM IFS='' read -r -N 1 c || break 1818462SApril.Chin@Sun.COM buf+="$c" 1828462SApril.Chin@Sun.COM done 1838462SApril.Chin@Sun.COM fi 1848462SApril.Chin@Sun.COM 1858462SApril.Chin@Sun.COM [[ ! 
-z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}" 1868462SApril.Chin@Sun.COM buf="" 1878462SApril.Chin@Sun.COM continue 1888462SApril.Chin@Sun.COM fi 1898462SApril.Chin@Sun.COM 1908462SApril.Chin@Sun.COM # check if the tag starts and ends at the same time (like "<br />") 1918462SApril.Chin@Sun.COM if [[ "${buf}" == ~(Er).*/ ]] ; then 1928462SApril.Chin@Sun.COM issingletag=true 1938462SApril.Chin@Sun.COM buf="${buf%*/}" 1948462SApril.Chin@Sun.COM else 1958462SApril.Chin@Sun.COM issingletag=false 1968462SApril.Chin@Sun.COM fi 1978462SApril.Chin@Sun.COM 1988462SApril.Chin@Sun.COM # check if the tag has attributes (e.g. space after name) 1998462SApril.Chin@Sun.COM if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then 2008462SApril.Chin@Sun.COM namebuf="${buf%%~(E)[[:space:][:blank:]].*}" 2018462SApril.Chin@Sun.COM attrbuf="${buf#~(E).*[[:space:][:blank:]]}" 2028462SApril.Chin@Sun.COM else 2038462SApril.Chin@Sun.COM namebuf="$buf" 2048462SApril.Chin@Sun.COM attrbuf="" 2058462SApril.Chin@Sun.COM fi 2068462SApril.Chin@Sun.COM 2078462SApril.Chin@Sun.COM if ${isendtag} ; then 2088462SApril.Chin@Sun.COM [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf" 2098462SApril.Chin@Sun.COM else 2108462SApril.Chin@Sun.COM [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf" 2118462SApril.Chin@Sun.COM 2128462SApril.Chin@Sun.COM # handle tags like <br/> (which are start- and end-tag in one piece) 2138462SApril.Chin@Sun.COM if ${issingletag} ; then 2148462SApril.Chin@Sun.COM [[ ! 
-z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf" 2158462SApril.Chin@Sun.COM fi 2168462SApril.Chin@Sun.COM fi 2178462SApril.Chin@Sun.COM buf="" 2188462SApril.Chin@Sun.COM else 2198462SApril.Chin@Sun.COM buf+="$c" 2208462SApril.Chin@Sun.COM fi 2218462SApril.Chin@Sun.COM done 2228462SApril.Chin@Sun.COM 2238462SApril.Chin@Sun.COM [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success" 2248462SApril.Chin@Sun.COM 2258462SApril.Chin@Sun.COM print # final newline to make filters like "sed" happy 2268462SApril.Chin@Sun.COM} 2278462SApril.Chin@Sun.COM 2288462SApril.Chin@Sun.COM# enumerate comments in a shell (or shell-like) script 2298462SApril.Chin@Sun.COMfunction enumerate_comments_shell 2308462SApril.Chin@Sun.COM{ 2318462SApril.Chin@Sun.COM set -o errexit 2328462SApril.Chin@Sun.COM 2338462SApril.Chin@Sun.COM typeset input_file="$1" 2348462SApril.Chin@Sun.COM nameref comment_array="$2" 2358462SApril.Chin@Sun.COM integer max_num_comments="$3" 2368462SApril.Chin@Sun.COM integer ca=0 # index in "comment_array" 2378462SApril.Chin@Sun.COM 2388462SApril.Chin@Sun.COM integer res=0 2398462SApril.Chin@Sun.COM 2408462SApril.Chin@Sun.COM typeset comment="" 2418462SApril.Chin@Sun.COM 2428462SApril.Chin@Sun.COM while (( res == 0 )) ; do 2438462SApril.Chin@Sun.COM IFS='' read -r line 2448462SApril.Chin@Sun.COM (( res=$? 
)) 2458462SApril.Chin@Sun.COM 2468462SApril.Chin@Sun.COM if [[ "${line}" == ~(El)#.* ]] ; then 2478462SApril.Chin@Sun.COM comment+="${line#\#}${ch.newline}" 2488462SApril.Chin@Sun.COM else 2498462SApril.Chin@Sun.COM if [[ "$comment" != "" ]] ; then 2508462SApril.Chin@Sun.COM comment_array[ca++]="${comment}" 2518462SApril.Chin@Sun.COM comment="" 2528462SApril.Chin@Sun.COM 2538462SApril.Chin@Sun.COM if (( ca > max_num_comments )) ; then 2548462SApril.Chin@Sun.COM break 2558462SApril.Chin@Sun.COM fi 2568462SApril.Chin@Sun.COM fi 2578462SApril.Chin@Sun.COM fi 2588462SApril.Chin@Sun.COM done <"${input_file}" 2598462SApril.Chin@Sun.COM 2608462SApril.Chin@Sun.COM return 0 2618462SApril.Chin@Sun.COM} 2628462SApril.Chin@Sun.COM 2638462SApril.Chin@Sun.COM 2648462SApril.Chin@Sun.COM# enumerate comments in a troff document 2658462SApril.Chin@Sun.COMfunction enumerate_comments_troff 2668462SApril.Chin@Sun.COM{ 2678462SApril.Chin@Sun.COM set -o errexit 2688462SApril.Chin@Sun.COM 2698462SApril.Chin@Sun.COM typeset input_file="$1" 2708462SApril.Chin@Sun.COM nameref comment_array="$2" 2718462SApril.Chin@Sun.COM integer max_num_comments="$3" 2728462SApril.Chin@Sun.COM integer ca=0 # index in "comment_array" 2738462SApril.Chin@Sun.COM 2748462SApril.Chin@Sun.COM integer res=0 2758462SApril.Chin@Sun.COM 2768462SApril.Chin@Sun.COM typeset comment="" 2778462SApril.Chin@Sun.COM 2788462SApril.Chin@Sun.COM while (( res == 0 )) ; do 2798462SApril.Chin@Sun.COM IFS='' read -r line 2808462SApril.Chin@Sun.COM (( res=$? 
)) 2818462SApril.Chin@Sun.COM 2828462SApril.Chin@Sun.COM if [[ "${line}" == ~(El)\.*\\\" ]] ; then 2838462SApril.Chin@Sun.COM comment+="${line#~(El)\.*\\\"}${ch.newline}" 2848462SApril.Chin@Sun.COM else 2858462SApril.Chin@Sun.COM if [[ "$comment" != "" ]] ; then 2868462SApril.Chin@Sun.COM comment_array[ca++]="${comment}" 2878462SApril.Chin@Sun.COM comment="" 2888462SApril.Chin@Sun.COM 2898462SApril.Chin@Sun.COM if (( ca > max_num_comments )) ; then 2908462SApril.Chin@Sun.COM break 2918462SApril.Chin@Sun.COM fi 2928462SApril.Chin@Sun.COM fi 2938462SApril.Chin@Sun.COM fi 2948462SApril.Chin@Sun.COM done <"${input_file}" 2958462SApril.Chin@Sun.COM 2968462SApril.Chin@Sun.COM return 0 2978462SApril.Chin@Sun.COM} 2988462SApril.Chin@Sun.COM 2998462SApril.Chin@Sun.COM 3008462SApril.Chin@Sun.COM# enumerate comments in files which are preprocessed by 3018462SApril.Chin@Sun.COM# CPP (e.g. C, C++, Imakefile etc.) 3028462SApril.Chin@Sun.COMfunction enumerate_comments_cpp 3038462SApril.Chin@Sun.COM{ 3048462SApril.Chin@Sun.COM set -o errexit 3058462SApril.Chin@Sun.COM# set -o nounset 3068462SApril.Chin@Sun.COM 3078462SApril.Chin@Sun.COM integer err=0 3088462SApril.Chin@Sun.COM 3098462SApril.Chin@Sun.COM typeset input_file="$1" 3108462SApril.Chin@Sun.COM nameref comment_array="$2" 3118462SApril.Chin@Sun.COM integer max_num_comments="$3" 3128462SApril.Chin@Sun.COM integer max_filesize_for_scan="$4" 3138462SApril.Chin@Sun.COM integer ca=0 # index in "comment_array" 3148462SApril.Chin@Sun.COM 3158462SApril.Chin@Sun.COM typeset content 3168462SApril.Chin@Sun.COM integer content_length 3178462SApril.Chin@Sun.COM 3188462SApril.Chin@Sun.COM integer file_pos # file position 31910898Sroland.mainz@nrubsig.org compound line_pos=( 3208462SApril.Chin@Sun.COM integer x=0 # X position in line 3218462SApril.Chin@Sun.COM integer y=0 # Y position in line (line number) 3228462SApril.Chin@Sun.COM ) 3238462SApril.Chin@Sun.COM typeset c c2 3248462SApril.Chin@Sun.COM 3258462SApril.Chin@Sun.COM typeset 
comment 3268462SApril.Chin@Sun.COM 32710898Sroland.mainz@nrubsig.org compound state=( 3288462SApril.Chin@Sun.COM # C comment state 3298462SApril.Chin@Sun.COM typeset in_c_comment=false 3308462SApril.Chin@Sun.COM # C++ comment state 33110898Sroland.mainz@nrubsig.org compound cxx=( 3328462SApril.Chin@Sun.COM typeset in_comment=false 3338462SApril.Chin@Sun.COM typeset comment_continued=false 3348462SApril.Chin@Sun.COM # position of current //-pos 33510898Sroland.mainz@nrubsig.org compound comment_pos=( 3368462SApril.Chin@Sun.COM integer x=-1 3378462SApril.Chin@Sun.COM integer y=-1 3388462SApril.Chin@Sun.COM ) 3398462SApril.Chin@Sun.COM # position of previous //-pos 34010898Sroland.mainz@nrubsig.org compound comment_prev_pos=( 3418462SApril.Chin@Sun.COM integer x=-1 3428462SApril.Chin@Sun.COM integer y=-1 3438462SApril.Chin@Sun.COM ) 3448462SApril.Chin@Sun.COM ) 3458462SApril.Chin@Sun.COM # literal state 3468462SApril.Chin@Sun.COM typeset in_sq_literal=false # single-quote literal 3478462SApril.Chin@Sun.COM typeset in_dq_literal=false # double-quote literal 3488462SApril.Chin@Sun.COM ) 3498462SApril.Chin@Sun.COM 3508462SApril.Chin@Sun.COM content="$(< "${input_file}")" 3518462SApril.Chin@Sun.COM 3528462SApril.Chin@Sun.COM # Truncate file to "max_filesize_for_scan" charatcters. 
3538462SApril.Chin@Sun.COM # This was originally added to work around a performance problem with 3548462SApril.Chin@Sun.COM # the ${str:offset:chunksize} operator which scales badly in ksh93 3558462SApril.Chin@Sun.COM # version 's' with the number of characters 3568462SApril.Chin@Sun.COM if (( ${#content} > max_filesize_for_scan )) ; then 3578462SApril.Chin@Sun.COM print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \ 3588462SApril.Chin@Sun.COM "${input_file}" \ 3598462SApril.Chin@Sun.COM max_filesize_for_scan 3608462SApril.Chin@Sun.COM content="${content:0:max_filesize_for_scan}" 3618462SApril.Chin@Sun.COM fi 3628462SApril.Chin@Sun.COM content_length=${#content} 3638462SApril.Chin@Sun.COM 3648462SApril.Chin@Sun.COM # Iterate through the source code. The last character 3658462SApril.Chin@Sun.COM # (when file_pos == content_length) will be empty to indicate 3668462SApril.Chin@Sun.COM # EOF (this is needed for cases like when 3678462SApril.Chin@Sun.COM # a C++ comment is not terminated by a newline... 
;-/) 3688462SApril.Chin@Sun.COM for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do 3698462SApril.Chin@Sun.COM c2="${content:file_pos:2}" 3708462SApril.Chin@Sun.COM c="${c2:0:1}" 3718462SApril.Chin@Sun.COM 3728462SApril.Chin@Sun.COM if [[ "$c" == "${ch.newline}" ]] ; then 3738462SApril.Chin@Sun.COM (( line_pos.x=0, line_pos.y++ )) 3748462SApril.Chin@Sun.COM else 3758462SApril.Chin@Sun.COM (( line_pos.x++ )) 3768462SApril.Chin@Sun.COM fi 3778462SApril.Chin@Sun.COM 3788462SApril.Chin@Sun.COM if ${state.in_c_comment} ; then 3798462SApril.Chin@Sun.COM if [[ "$c2" == "*/" ]] ; then 3808462SApril.Chin@Sun.COM (( file_pos++, line_pos.x++ )) 3818462SApril.Chin@Sun.COM state.in_c_comment=false 3828462SApril.Chin@Sun.COM 3838462SApril.Chin@Sun.COM # flush comment text 3848462SApril.Chin@Sun.COM comment_array[ca++]="${comment}" 3858462SApril.Chin@Sun.COM comment="" 3868462SApril.Chin@Sun.COM 3878462SApril.Chin@Sun.COM if (( ca > max_num_comments )) ; then 3888462SApril.Chin@Sun.COM break 3898462SApril.Chin@Sun.COM fi 3908462SApril.Chin@Sun.COM else 3918462SApril.Chin@Sun.COM comment+="$c" 3928462SApril.Chin@Sun.COM fi 3938462SApril.Chin@Sun.COM elif ${state.cxx.in_comment} ; then 3948462SApril.Chin@Sun.COM if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then 3958462SApril.Chin@Sun.COM state.cxx.in_comment=false 3968462SApril.Chin@Sun.COM 3978462SApril.Chin@Sun.COM # flush comment text 3988462SApril.Chin@Sun.COM if ${state.cxx.comment_continued} ; then 3998462SApril.Chin@Sun.COM comment_array[ca-1]+="${ch.newline}${comment}" 4008462SApril.Chin@Sun.COM (( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x , 4018462SApril.Chin@Sun.COM state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y )) 4028462SApril.Chin@Sun.COM else 4038462SApril.Chin@Sun.COM comment_array[ca++]="${comment}" 4048462SApril.Chin@Sun.COM (( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x , 4058462SApril.Chin@Sun.COM state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y )) 
4068462SApril.Chin@Sun.COM fi 4078462SApril.Chin@Sun.COM comment="" 4088462SApril.Chin@Sun.COM 4098462SApril.Chin@Sun.COM if (( ca > max_num_comments )) ; then 4108462SApril.Chin@Sun.COM break 4118462SApril.Chin@Sun.COM fi 4128462SApril.Chin@Sun.COM else 4138462SApril.Chin@Sun.COM comment+="$c" 4148462SApril.Chin@Sun.COM fi 4158462SApril.Chin@Sun.COM elif ${state.in_sq_literal} ; then 4168462SApril.Chin@Sun.COM if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then 4178462SApril.Chin@Sun.COM state.in_sq_literal=false 4188462SApril.Chin@Sun.COM fi 4198462SApril.Chin@Sun.COM elif ${state.in_dq_literal} ; then 4208462SApril.Chin@Sun.COM if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then 4218462SApril.Chin@Sun.COM state.in_dq_literal=false 4228462SApril.Chin@Sun.COM fi 4238462SApril.Chin@Sun.COM else 4248462SApril.Chin@Sun.COM if [[ "$c2" == "/*" ]] ; then 4258462SApril.Chin@Sun.COM (( file_pos++, line_pos.x++ )) 4268462SApril.Chin@Sun.COM state.in_c_comment=true 4278462SApril.Chin@Sun.COM comment="" 4288462SApril.Chin@Sun.COM elif [[ "$c2" == "//" ]] ; then 4298462SApril.Chin@Sun.COM (( file_pos++, line_pos.x++ )) 4308462SApril.Chin@Sun.COM if (( state.cxx.comment_prev_pos.x == line_pos.x && \ 4318462SApril.Chin@Sun.COM state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then 4328462SApril.Chin@Sun.COM state.cxx.comment_continued=true 4338462SApril.Chin@Sun.COM else 4348462SApril.Chin@Sun.COM state.cxx.comment_continued=false 4358462SApril.Chin@Sun.COM fi 4368462SApril.Chin@Sun.COM (( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y )) 4378462SApril.Chin@Sun.COM state.cxx.in_comment=true 4388462SApril.Chin@Sun.COM comment="" 4398462SApril.Chin@Sun.COM elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then 4408462SApril.Chin@Sun.COM state.in_sq_literal=true 4418462SApril.Chin@Sun.COM elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then 4428462SApril.Chin@Sun.COM state.in_dq_literal=true 
4438462SApril.Chin@Sun.COM fi 4448462SApril.Chin@Sun.COM fi 4458462SApril.Chin@Sun.COM done 4468462SApril.Chin@Sun.COM 4478462SApril.Chin@Sun.COM if [[ "$comment" != "" ]] ; then 4488462SApril.Chin@Sun.COM print -u2 "## ERROR: Comment text buffer not empty at EOF." 4498462SApril.Chin@Sun.COM err=1 4508462SApril.Chin@Sun.COM fi 4518462SApril.Chin@Sun.COM 4528462SApril.Chin@Sun.COM if ${state.in_c_comment} ; then 4538462SApril.Chin@Sun.COM print -u2 "## ERROR: C comment did not close before EOF." 4548462SApril.Chin@Sun.COM err=1 4558462SApril.Chin@Sun.COM fi 4568462SApril.Chin@Sun.COM 4578462SApril.Chin@Sun.COM if ${state.cxx.in_comment} ; then 4588462SApril.Chin@Sun.COM print -u2 "## ERROR: C++ comment did not close before EOF." 4598462SApril.Chin@Sun.COM err=1 4608462SApril.Chin@Sun.COM fi 4618462SApril.Chin@Sun.COM 4628462SApril.Chin@Sun.COM if ${state.in_dq_literal} ; then 4638462SApril.Chin@Sun.COM print -u2 "## ERROR: Double-quoted literal did not close before EOF." 4648462SApril.Chin@Sun.COM err=1 4658462SApril.Chin@Sun.COM fi 4668462SApril.Chin@Sun.COM 4678462SApril.Chin@Sun.COM # We treat this one only as warning since things like "foo.html.cpp" may 4688462SApril.Chin@Sun.COM # trigger this condition accidently 4698462SApril.Chin@Sun.COM if ${state.in_sq_literal} ; then 4708462SApril.Chin@Sun.COM print -u2 "## WARNING: Single-quoted literal did not close before EOF." 
4718462SApril.Chin@Sun.COM fi 4728462SApril.Chin@Sun.COM 4738462SApril.Chin@Sun.COM return $err 4748462SApril.Chin@Sun.COM} 4758462SApril.Chin@Sun.COM 4768462SApril.Chin@Sun.COM# determine file type 4778462SApril.Chin@Sun.COMfunction get_file_format 4788462SApril.Chin@Sun.COM{ 4798462SApril.Chin@Sun.COM set -o errexit 4808462SApril.Chin@Sun.COM 4818462SApril.Chin@Sun.COM typeset filename="$1" 4828462SApril.Chin@Sun.COM nameref file_format="$2" 4838462SApril.Chin@Sun.COM 4848462SApril.Chin@Sun.COM typeset fileeval # evaluation result of /usr/bin/file 4858462SApril.Chin@Sun.COM 4868462SApril.Chin@Sun.COM # check whether "filename" is a plain, readable file 4878462SApril.Chin@Sun.COM [[ ! -f "$filename" ]] && return 1 4888462SApril.Chin@Sun.COM [[ ! -r "$filename" ]] && return 1 4898462SApril.Chin@Sun.COM 4908462SApril.Chin@Sun.COM # In theory this code would exclusively look at the contents of 4918462SApril.Chin@Sun.COM # the file to figure out it's file format - unfortunately 4928462SApril.Chin@Sun.COM # /usr/bin/file is virtually useless (the heuristics, matching 4938462SApril.Chin@Sun.COM # and output unreliable) for many file formats and therefore 4948462SApril.Chin@Sun.COM # we have to do a multi-stage approach which looks 4958462SApril.Chin@Sun.COM # at the file's content if possible and at the filename 4968462SApril.Chin@Sun.COM # otherwise. Fun... 
;-( 4978462SApril.Chin@Sun.COM 4988462SApril.Chin@Sun.COM # pass one: Find matches for file formats where /usr/bin/file 4998462SApril.Chin@Sun.COM # is known to be unreliable: 5008462SApril.Chin@Sun.COM case "$filename" in 5018462SApril.Chin@Sun.COM *.[ch] | *.cpp | *.cc | *.cxx | *.hxx) 5028462SApril.Chin@Sun.COM file_format="c_source" 5038462SApril.Chin@Sun.COM return 0 5048462SApril.Chin@Sun.COM ;; 5058462SApril.Chin@Sun.COM *Imakefile) 5068462SApril.Chin@Sun.COM file_format="imakefile" 5078462SApril.Chin@Sun.COM return 0 5088462SApril.Chin@Sun.COM ;; 5098462SApril.Chin@Sun.COM *Makefile) 5108462SApril.Chin@Sun.COM file_format="makefile" 5118462SApril.Chin@Sun.COM return 0 5128462SApril.Chin@Sun.COM ;; 5138462SApril.Chin@Sun.COM esac 5148462SApril.Chin@Sun.COM 5158462SApril.Chin@Sun.COM # pass two: match by file content via /usr/bin/file 5168462SApril.Chin@Sun.COM fileeval="$(LC_ALL=C /usr/bin/file "$filename")" 5178462SApril.Chin@Sun.COM case "$fileeval" in 5188462SApril.Chin@Sun.COM ~(E)roff) 5198462SApril.Chin@Sun.COM file_format="troff" 5208462SApril.Chin@Sun.COM return 0 5218462SApril.Chin@Sun.COM ;; 5228462SApril.Chin@Sun.COM ~(E)html\ document) 5238462SApril.Chin@Sun.COM file_format="html" 5248462SApril.Chin@Sun.COM return 0 5258462SApril.Chin@Sun.COM ;; 5268462SApril.Chin@Sun.COM ~(E)sgml\ document) 5278462SApril.Chin@Sun.COM file_format="sgml" 5288462SApril.Chin@Sun.COM return 0 5298462SApril.Chin@Sun.COM ;; 5308462SApril.Chin@Sun.COM ~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script) 5318462SApril.Chin@Sun.COM file_format="shell" 5328462SApril.Chin@Sun.COM return 0 5338462SApril.Chin@Sun.COM ;; 5348462SApril.Chin@Sun.COM ~(E)executable.*/perl\ script) 5358462SApril.Chin@Sun.COM file_format="perl" 5368462SApril.Chin@Sun.COM return 0 5378462SApril.Chin@Sun.COM ;; 5388462SApril.Chin@Sun.COM esac 5398462SApril.Chin@Sun.COM 5408462SApril.Chin@Sun.COM # pass three: fallhack to filename matching 5418462SApril.Chin@Sun.COM case 
"$filename" in 5428462SApril.Chin@Sun.COM *.man) 5438462SApril.Chin@Sun.COM file_format="troff" 5448462SApril.Chin@Sun.COM return 0 5458462SApril.Chin@Sun.COM ;; 5468462SApril.Chin@Sun.COM *.html) 5478462SApril.Chin@Sun.COM file_format="html" 5488462SApril.Chin@Sun.COM return 0 5498462SApril.Chin@Sun.COM ;; 5508462SApril.Chin@Sun.COM *.sgml) 5518462SApril.Chin@Sun.COM file_format="sgml" 5528462SApril.Chin@Sun.COM return 0 5538462SApril.Chin@Sun.COM ;; 5548462SApril.Chin@Sun.COM *.xml) 5558462SApril.Chin@Sun.COM file_format="xml" 5568462SApril.Chin@Sun.COM return 0 5578462SApril.Chin@Sun.COM ;; 5588462SApril.Chin@Sun.COM *.png) 5598462SApril.Chin@Sun.COM file_format="image_png" 5608462SApril.Chin@Sun.COM return 0 5618462SApril.Chin@Sun.COM ;; 5628462SApril.Chin@Sun.COM *.xcf) 5638462SApril.Chin@Sun.COM file_format="image_xcf" 5648462SApril.Chin@Sun.COM return 0 5658462SApril.Chin@Sun.COM ;; 5668462SApril.Chin@Sun.COM *.shar) 5678462SApril.Chin@Sun.COM file_format="archive_shell" 5688462SApril.Chin@Sun.COM return 0 5698462SApril.Chin@Sun.COM ;; 5708462SApril.Chin@Sun.COM *.sh) 5718462SApril.Chin@Sun.COM file_format="shell" 5728462SApril.Chin@Sun.COM return 0 5738462SApril.Chin@Sun.COM ;; 5748462SApril.Chin@Sun.COM *.pcf) 5758462SApril.Chin@Sun.COM file_format="font_pcf" 5768462SApril.Chin@Sun.COM return 0 5778462SApril.Chin@Sun.COM ;; 5788462SApril.Chin@Sun.COM *.bdf) 5798462SApril.Chin@Sun.COM file_format="font_bdf" 5808462SApril.Chin@Sun.COM return 0 5818462SApril.Chin@Sun.COM ;; 5828462SApril.Chin@Sun.COM *.pmf) 5838462SApril.Chin@Sun.COM file_format="font_pmf" 5848462SApril.Chin@Sun.COM return 0 5858462SApril.Chin@Sun.COM ;; 5868462SApril.Chin@Sun.COM *.ttf | *.otf) 5878462SApril.Chin@Sun.COM file_format="font_ttf" 5888462SApril.Chin@Sun.COM return 0 5898462SApril.Chin@Sun.COM ;; 5908462SApril.Chin@Sun.COM *.pfa | *.pfb) 5918462SApril.Chin@Sun.COM file_format="font_postscript" 5928462SApril.Chin@Sun.COM return 0 5938462SApril.Chin@Sun.COM ;; 
5948462SApril.Chin@Sun.COM esac 5958462SApril.Chin@Sun.COM 5968462SApril.Chin@Sun.COM return 1 5978462SApril.Chin@Sun.COM} 5988462SApril.Chin@Sun.COM 5998462SApril.Chin@Sun.COMfunction extract_comments 6008462SApril.Chin@Sun.COM{ 6018462SApril.Chin@Sun.COM set -o errexit 6028462SApril.Chin@Sun.COM 6038462SApril.Chin@Sun.COM nameref records="$1" 6048462SApril.Chin@Sun.COM typeset filename="$2" 6058462SApril.Chin@Sun.COM integer max_num_comments="$3" 6068462SApril.Chin@Sun.COM integer max_filesize_for_scan="$4" 6078462SApril.Chin@Sun.COM 6088462SApril.Chin@Sun.COM typeset datatype="" 6098462SApril.Chin@Sun.COM 6108462SApril.Chin@Sun.COM records[${filename}]=( 6118462SApril.Chin@Sun.COM typeset filename="$filename" 6128462SApril.Chin@Sun.COM 6138462SApril.Chin@Sun.COM typeset fileformat_found="false" # "true" or "false" 6148462SApril.Chin@Sun.COM typeset file_format="" 6158462SApril.Chin@Sun.COM 6168462SApril.Chin@Sun.COM typeset -A hashsum 6178462SApril.Chin@Sun.COM 6188462SApril.Chin@Sun.COM typeset comments_parsed="false" # "true" or "false" 6198462SApril.Chin@Sun.COM typeset -a comments 6208462SApril.Chin@Sun.COM ) 6218462SApril.Chin@Sun.COM 6228462SApril.Chin@Sun.COM records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")" 6238462SApril.Chin@Sun.COM records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")" 6248462SApril.Chin@Sun.COM 6258462SApril.Chin@Sun.COM if get_file_format "$filename" datatype ; then 6268462SApril.Chin@Sun.COM records[${filename}].fileformat_found="true" 6278462SApril.Chin@Sun.COM records[${filename}].file_format="$datatype" 6288462SApril.Chin@Sun.COM else 6298462SApril.Chin@Sun.COM return 1 6308462SApril.Chin@Sun.COM fi 6318462SApril.Chin@Sun.COM 6328462SApril.Chin@Sun.COM case "$datatype" in 6338462SApril.Chin@Sun.COM c_source|imakefile) 6348462SApril.Chin@Sun.COM enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \ 6358462SApril.Chin@Sun.COM 
records[${filename}].comments_parsed=true 6368462SApril.Chin@Sun.COM ;; 6378462SApril.Chin@Sun.COM shell|makefile) 6388462SApril.Chin@Sun.COM enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \ 6398462SApril.Chin@Sun.COM records[${filename}].comments_parsed=true 6408462SApril.Chin@Sun.COM ;; 6418462SApril.Chin@Sun.COM troff) 6428462SApril.Chin@Sun.COM enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \ 6438462SApril.Chin@Sun.COM records[${filename}].comments_parsed=true 6448462SApril.Chin@Sun.COM ;; 6458462SApril.Chin@Sun.COM # NOTE: Disabled for now 6468462SApril.Chin@Sun.COM #xml|html|sgml) 6478462SApril.Chin@Sun.COM # enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \ 6488462SApril.Chin@Sun.COM # records[${filename}].comments_parsed=true 6498462SApril.Chin@Sun.COM # ;; 6508462SApril.Chin@Sun.COM esac 6518462SApril.Chin@Sun.COM 6528462SApril.Chin@Sun.COM return 0 6538462SApril.Chin@Sun.COM} 6548462SApril.Chin@Sun.COM 6558462SApril.Chin@Sun.COM# parse HTTP return code, cookies etc. 
6568462SApril.Chin@Sun.COMfunction parse_http_response 6578462SApril.Chin@Sun.COM{ 6588462SApril.Chin@Sun.COM nameref response="$1" 6598462SApril.Chin@Sun.COM typeset h statuscode statusmsg i 6608462SApril.Chin@Sun.COM 6618462SApril.Chin@Sun.COM # we use '\r' as additional IFS to filter the final '\r' 6628462SApril.Chin@Sun.COM IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code> 6638462SApril.Chin@Sun.COM [[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; } 6648462SApril.Chin@Sun.COM [[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; } 6658462SApril.Chin@Sun.COM response.statuscode="$statuscode" 6668462SApril.Chin@Sun.COM response.statusmsg="$statusmsg" 6678462SApril.Chin@Sun.COM 6688462SApril.Chin@Sun.COM # skip remaining headers 6698462SApril.Chin@Sun.COM while IFS='' read -r i ; do 6708462SApril.Chin@Sun.COM [[ "$i" == $'\r' ]] && break 6718462SApril.Chin@Sun.COM 6728462SApril.Chin@Sun.COM # strip '\r' at the end 6738462SApril.Chin@Sun.COM i="${i/~(Er)$'\r'/}" 6748462SApril.Chin@Sun.COM 6758462SApril.Chin@Sun.COM case "$i" in 6768462SApril.Chin@Sun.COM ~(Eli)Content-Type:.*) 6778462SApril.Chin@Sun.COM response.content_type="${i/~(El).*:[[:blank:]]*/}" 6788462SApril.Chin@Sun.COM ;; 6798462SApril.Chin@Sun.COM ~(Eli)Content-Length:[[:blank:]]*[0-9]*) 6808462SApril.Chin@Sun.COM integer response.content_length="${i/~(El).*:[[:blank:]]*/}" 6818462SApril.Chin@Sun.COM ;; 6828462SApril.Chin@Sun.COM ~(Eli)Transfer-Encoding:.*) 6838462SApril.Chin@Sun.COM response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}" 6848462SApril.Chin@Sun.COM ;; 6858462SApril.Chin@Sun.COM esac 6868462SApril.Chin@Sun.COM done 6878462SApril.Chin@Sun.COM 6888462SApril.Chin@Sun.COM return 0 6898462SApril.Chin@Sun.COM} 6908462SApril.Chin@Sun.COM 6918462SApril.Chin@Sun.COMfunction cat_http_body 6928462SApril.Chin@Sun.COM{ 6938462SApril.Chin@Sun.COM typeset emode="$1" 6948462SApril.Chin@Sun.COM 
typeset hexchunksize="0" 6958462SApril.Chin@Sun.COM integer chunksize=0 6968462SApril.Chin@Sun.COM 6978462SApril.Chin@Sun.COM if [[ "${emode}" == "chunked" ]] ; then 6988462SApril.Chin@Sun.COM while IFS=$'\r' read hexchunksize && 699*12068SRoger.Faulkner@Oracle.COM [[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] && 700*12068SRoger.Faulkner@Oracle.COM (( chunksize=$( printf "16#%s\n" "${hexchunksize}" ) )) && (( chunksize > 0 )) ; do 7018462SApril.Chin@Sun.COM dd bs=1 count="${chunksize}" 2>/dev/null 7028462SApril.Chin@Sun.COM done 7038462SApril.Chin@Sun.COM else 7048462SApril.Chin@Sun.COM cat 7058462SApril.Chin@Sun.COM fi 7068462SApril.Chin@Sun.COM 7078462SApril.Chin@Sun.COM return 0 7088462SApril.Chin@Sun.COM} 7098462SApril.Chin@Sun.COM 71010898Sroland.mainz@nrubsig.orgfunction cat_url 7118462SApril.Chin@Sun.COM{ 7128462SApril.Chin@Sun.COM typeset protocol="${1%://*}" 7138462SApril.Chin@Sun.COM typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html" 71410898Sroland.mainz@nrubsig.org 71510898Sroland.mainz@nrubsig.org if [[ "${protocol}" == "file" ]] ; then 71610898Sroland.mainz@nrubsig.org cat "${path1}" 71710898Sroland.mainz@nrubsig.org return $? 
71810898Sroland.mainz@nrubsig.org elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then 71910898Sroland.mainz@nrubsig.org typeset host="${path1%%/*}" 72010898Sroland.mainz@nrubsig.org typeset path="${path1#*/}" 72110898Sroland.mainz@nrubsig.org typeset port="${host##*:}" 7228462SApril.Chin@Sun.COM 72310898Sroland.mainz@nrubsig.org integer netfd 72410898Sroland.mainz@nrubsig.org compound httpresponse # http response 7258462SApril.Chin@Sun.COM 72610898Sroland.mainz@nrubsig.org # If URL did not contain a port number in the host part then look at the 72710898Sroland.mainz@nrubsig.org # protocol to get the port number 72810898Sroland.mainz@nrubsig.org if [[ "${port}" == "${host}" ]] ; then 72910898Sroland.mainz@nrubsig.org case "${protocol}" in 73010898Sroland.mainz@nrubsig.org "http") port=80 ;; 73110898Sroland.mainz@nrubsig.org "https") port=443 ;; 73210898Sroland.mainz@nrubsig.org *) port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;; 73310898Sroland.mainz@nrubsig.org esac 73410898Sroland.mainz@nrubsig.org else 73510898Sroland.mainz@nrubsig.org host="${host%:*}" 73610898Sroland.mainz@nrubsig.org fi 7378462SApril.Chin@Sun.COM 73810898Sroland.mainz@nrubsig.org printmsg "protocol=${protocol} port=${port} host=${host} path=${path}" 7398462SApril.Chin@Sun.COM 74010898Sroland.mainz@nrubsig.org # prechecks 74110898Sroland.mainz@nrubsig.org [[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; } 74210898Sroland.mainz@nrubsig.org [[ "${port}" != "" ]] || { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; } 74310898Sroland.mainz@nrubsig.org [[ "${host}" != "" ]] || { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; } 74410898Sroland.mainz@nrubsig.org [[ "${path}" != "" ]] || { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; } 7458462SApril.Chin@Sun.COM 74610898Sroland.mainz@nrubsig.org # open TCP channel 74710898Sroland.mainz@nrubsig.org if [[ "${protocol}" == "https" ]] ; then 74810898Sroland.mainz@nrubsig.org 
compound sslfifo 74910898Sroland.mainz@nrubsig.org sslfifo.dir="$(mktemp -d)" 75010898Sroland.mainz@nrubsig.org sslfifo.in="${sslfifo.dir}/in" 75110898Sroland.mainz@nrubsig.org sslfifo.out="${sslfifo.dir}/out" 75210898Sroland.mainz@nrubsig.org 75310898Sroland.mainz@nrubsig.org # register an EXIT trap and use "errexit" to leave it at the first error 75410898Sroland.mainz@nrubsig.org # (this saves lots of if/fi tests for error checking) 75510898Sroland.mainz@nrubsig.org trap "rm -r \"${sslfifo.dir}\"" EXIT 75610898Sroland.mainz@nrubsig.org set -o errexit 75710898Sroland.mainz@nrubsig.org 75810898Sroland.mainz@nrubsig.org mkfifo "${sslfifo.in}" "${sslfifo.out}" 75910898Sroland.mainz@nrubsig.org 76010898Sroland.mainz@nrubsig.org # create async openssl child to handle https 76110898Sroland.mainz@nrubsig.org openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" & 7628462SApril.Chin@Sun.COM 76310898Sroland.mainz@nrubsig.org # send HTTP request 76410898Sroland.mainz@nrubsig.org request="GET /${path} HTTP/1.1\r\n" 76510898Sroland.mainz@nrubsig.org request+="Host: ${host}\r\n" 766*12068SRoger.Faulkner@Oracle.COM request+="User-Agent: crawlsrccomments/ksh93(ssl) (2010-03-27; $(uname -s -r -p))\r\n" 76710898Sroland.mainz@nrubsig.org request+="Connection: close\r\n" 76810898Sroland.mainz@nrubsig.org print -n -- "${request}\r\n" >> "${sslfifo.in}" 76910898Sroland.mainz@nrubsig.org 77010898Sroland.mainz@nrubsig.org # collect response and send it to stdout 77110898Sroland.mainz@nrubsig.org { 77210898Sroland.mainz@nrubsig.org parse_http_response httpresponse 77310898Sroland.mainz@nrubsig.org cat_http_body "${httpresponse.transfer_encoding}" 77410898Sroland.mainz@nrubsig.org } <"${sslfifo.out}" 77510898Sroland.mainz@nrubsig.org 77610898Sroland.mainz@nrubsig.org wait || { print -u2 -f "%s: openssl failed.\n" ; exit 1 ; } 77710898Sroland.mainz@nrubsig.org 77810898Sroland.mainz@nrubsig.org return 0 77910898Sroland.mainz@nrubsig.org else 
78010898Sroland.mainz@nrubsig.org redirect {netfd}<> "/dev/tcp/${host}/${port}" 78110898Sroland.mainz@nrubsig.org (( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; } 78210898Sroland.mainz@nrubsig.org 78310898Sroland.mainz@nrubsig.org # send HTTP request 78410898Sroland.mainz@nrubsig.org request="GET /${path} HTTP/1.1\r\n" 78510898Sroland.mainz@nrubsig.org request+="Host: ${host}\r\n" 786*12068SRoger.Faulkner@Oracle.COM request+="User-Agent: crawlsrccomments/ksh93 (2010-03-27; $(uname -s -r -p))\r\n" 78710898Sroland.mainz@nrubsig.org request+="Connection: close\r\n" 78810898Sroland.mainz@nrubsig.org print -n -- "${request}\r\n" >&${netfd} 7898462SApril.Chin@Sun.COM 79010898Sroland.mainz@nrubsig.org # collect response and send it to stdout 79110898Sroland.mainz@nrubsig.org parse_http_response httpresponse <&${netfd} 79210898Sroland.mainz@nrubsig.org cat_http_body "${httpresponse.transfer_encoding}" <&${netfd} 7938462SApril.Chin@Sun.COM 79410898Sroland.mainz@nrubsig.org # close connection 79510898Sroland.mainz@nrubsig.org redirect {netfd}<&- 79610898Sroland.mainz@nrubsig.org 79710898Sroland.mainz@nrubsig.org return 0 79810898Sroland.mainz@nrubsig.org fi 79910898Sroland.mainz@nrubsig.org else 80010898Sroland.mainz@nrubsig.org return 1 80110898Sroland.mainz@nrubsig.org fi 80210898Sroland.mainz@nrubsig.org # notreached 8038462SApril.Chin@Sun.COM} 8048462SApril.Chin@Sun.COM 8058462SApril.Chin@Sun.COMfunction print_stats 8068462SApril.Chin@Sun.COM{ 8078462SApril.Chin@Sun.COM set -o errexit 8088462SApril.Chin@Sun.COM 8098462SApril.Chin@Sun.COM # gather some statistics 81010898Sroland.mainz@nrubsig.org compound stats=( 8118462SApril.Chin@Sun.COM integer files_with_comments=0 8128462SApril.Chin@Sun.COM integer files_without_comments=0 8138462SApril.Chin@Sun.COM 8148462SApril.Chin@Sun.COM integer files_without_known_format=0 8158462SApril.Chin@Sun.COM 8168462SApril.Chin@Sun.COM integer files_with_license_info=0 8178462SApril.Chin@Sun.COM 
integer files_without_license_info=0 8188462SApril.Chin@Sun.COM 8198462SApril.Chin@Sun.COM integer total_num_files=0 8208462SApril.Chin@Sun.COM ) 8218462SApril.Chin@Sun.COM 8228462SApril.Chin@Sun.COM for i in $(printf "%s\n" "${!records[@]}" | sort) ; do 8238462SApril.Chin@Sun.COM if "${records[$i].comments_parsed}" ; then 8248462SApril.Chin@Sun.COM (( stats.files_with_comments++ )) 8258462SApril.Chin@Sun.COM else 8268462SApril.Chin@Sun.COM (( stats.files_without_comments++ )) 8278462SApril.Chin@Sun.COM fi 8288462SApril.Chin@Sun.COM 8298462SApril.Chin@Sun.COM if ! "${records[$i].fileformat_found}" ; then 8308462SApril.Chin@Sun.COM (( stats.files_without_known_format++ )) 8318462SApril.Chin@Sun.COM fi 8328462SApril.Chin@Sun.COM 8338462SApril.Chin@Sun.COM if "${records[$i].license_info_found}" ; then 8348462SApril.Chin@Sun.COM (( stats.files_with_license_info++ )) 8358462SApril.Chin@Sun.COM else 8368462SApril.Chin@Sun.COM (( stats.files_without_license_info++ )) 8378462SApril.Chin@Sun.COM fi 8388462SApril.Chin@Sun.COM 8398462SApril.Chin@Sun.COM (( stats.total_num_files++ )) 8408462SApril.Chin@Sun.COM done 8418462SApril.Chin@Sun.COM 84210898Sroland.mainz@nrubsig.org print -v stats 8438462SApril.Chin@Sun.COM return 0 8448462SApril.Chin@Sun.COM} 8458462SApril.Chin@Sun.COM 8468462SApril.Chin@Sun.COM 8478462SApril.Chin@Sun.COMfunction print_comments_plain 8488462SApril.Chin@Sun.COM{ 8498462SApril.Chin@Sun.COM set -o errexit 8508462SApril.Chin@Sun.COM 8518462SApril.Chin@Sun.COM nameref records=$1 8528462SApril.Chin@Sun.COM nameref options=$2 8538462SApril.Chin@Sun.COM typeset i j 8548462SApril.Chin@Sun.COM 8558462SApril.Chin@Sun.COM for i in $(printf "%s\n" "${!records[@]}" | sort) ; do 8568462SApril.Chin@Sun.COM nameref node=records[$i] 8578462SApril.Chin@Sun.COM 8588462SApril.Chin@Sun.COM if [[ "${options.filepattern.accept}" != "" ]] && \ 8598462SApril.Chin@Sun.COM [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then 8608462SApril.Chin@Sun.COM continue 
8618462SApril.Chin@Sun.COM fi 8628462SApril.Chin@Sun.COM if [[ "${options.filepattern.reject}" != "" ]] && \ 8638462SApril.Chin@Sun.COM [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then 8648462SApril.Chin@Sun.COM continue 8658462SApril.Chin@Sun.COM fi 8668462SApril.Chin@Sun.COM 8678462SApril.Chin@Sun.COM node.license_info_found=false 8688462SApril.Chin@Sun.COM 8698462SApril.Chin@Sun.COM if ! "${node.comments_parsed}" ; then 8708462SApril.Chin@Sun.COM continue 8718462SApril.Chin@Sun.COM fi 8728462SApril.Chin@Sun.COM 8738462SApril.Chin@Sun.COM for j in "${!node.comments[@]}" ; do 8748462SApril.Chin@Sun.COM typeset s="${node.comments[$j]}" 8758462SApril.Chin@Sun.COM typeset match=false 8768462SApril.Chin@Sun.COM 8778462SApril.Chin@Sun.COM if [[ "${options.commentpattern.accept}" != "" ]] && \ 8788462SApril.Chin@Sun.COM [[ "$s" == ${options.commentpattern.accept} ]] ; then 8798462SApril.Chin@Sun.COM match=true 8808462SApril.Chin@Sun.COM fi 8818462SApril.Chin@Sun.COM if [[ "${options.commentpattern.reject}" != "" ]] && \ 8828462SApril.Chin@Sun.COM [[ "$s" == ${options.commentpattern.reject} ]] ; then 8838462SApril.Chin@Sun.COM match=false 8848462SApril.Chin@Sun.COM fi 8858462SApril.Chin@Sun.COM 8868462SApril.Chin@Sun.COM if "${match}" ; then 8878462SApril.Chin@Sun.COM printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j" 8888462SApril.Chin@Sun.COM printf "%s\n" "$s" 8898462SApril.Chin@Sun.COM node.license_info_found=true 8908462SApril.Chin@Sun.COM fi 8918462SApril.Chin@Sun.COM done 8928462SApril.Chin@Sun.COM 8938462SApril.Chin@Sun.COM if ! 
"${node.license_info_found}" ; then 8948462SApril.Chin@Sun.COM printf "## no match found in '%s'," "${node.filename}" 8958462SApril.Chin@Sun.COM printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \ 8968462SApril.Chin@Sun.COM "${node.comments_parsed}" \ 8978462SApril.Chin@Sun.COM "${node.fileformat_found}" \ 8988462SApril.Chin@Sun.COM "${node.file_format}" 8998462SApril.Chin@Sun.COM fi 9008462SApril.Chin@Sun.COM done 9018462SApril.Chin@Sun.COM 9028462SApril.Chin@Sun.COM return 0 9038462SApril.Chin@Sun.COM} 9048462SApril.Chin@Sun.COM 9058462SApril.Chin@Sun.COMfunction print_comments_duplicates_compressed 9068462SApril.Chin@Sun.COM{ 9078462SApril.Chin@Sun.COM set -o errexit 9088462SApril.Chin@Sun.COM 9098462SApril.Chin@Sun.COM nameref records=$1 9108462SApril.Chin@Sun.COM nameref options=$2 9118462SApril.Chin@Sun.COM typeset i j 9128462SApril.Chin@Sun.COM typeset -A hashed_comments 9138462SApril.Chin@Sun.COM integer num_hashed_comments 9148462SApril.Chin@Sun.COM 9158462SApril.Chin@Sun.COM for i in $(printf "%s\n" "${!records[@]}" | sort) ; do 9168462SApril.Chin@Sun.COM nameref node=records[$i] 9178462SApril.Chin@Sun.COM 9188462SApril.Chin@Sun.COM if [[ "${options.filepattern.accept}" != "" ]] && \ 9198462SApril.Chin@Sun.COM [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then 9208462SApril.Chin@Sun.COM continue 9218462SApril.Chin@Sun.COM fi 9228462SApril.Chin@Sun.COM if [[ "${options.filepattern.reject}" != "" ]] && \ 9238462SApril.Chin@Sun.COM [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then 9248462SApril.Chin@Sun.COM continue 9258462SApril.Chin@Sun.COM fi 9268462SApril.Chin@Sun.COM 9278462SApril.Chin@Sun.COM node.license_info_found=false 9288462SApril.Chin@Sun.COM 9298462SApril.Chin@Sun.COM if ! 
"${node.comments_parsed}" ; then 9308462SApril.Chin@Sun.COM continue 9318462SApril.Chin@Sun.COM fi 9328462SApril.Chin@Sun.COM 9338462SApril.Chin@Sun.COM for j in "${!node.comments[@]}" ; do 9348462SApril.Chin@Sun.COM typeset s="${node.comments[$j]}" 9358462SApril.Chin@Sun.COM typeset match=false 9368462SApril.Chin@Sun.COM 9378462SApril.Chin@Sun.COM if [[ "${options.commentpattern.accept}" != "" ]] && \ 9388462SApril.Chin@Sun.COM [[ "$s" == ${options.commentpattern.accept} ]] ; then 9398462SApril.Chin@Sun.COM match=true 9408462SApril.Chin@Sun.COM fi 9418462SApril.Chin@Sun.COM if [[ "${options.commentpattern.reject}" != "" ]] && \ 9428462SApril.Chin@Sun.COM [[ "$s" == ${options.commentpattern.reject} ]] ; then 9438462SApril.Chin@Sun.COM match=false 9448462SApril.Chin@Sun.COM fi 9458462SApril.Chin@Sun.COM 9468462SApril.Chin@Sun.COM 9478462SApril.Chin@Sun.COM if "${match}" ; then 9488462SApril.Chin@Sun.COM typeset -l hashstring # lowercase 9498462SApril.Chin@Sun.COM 9508462SApril.Chin@Sun.COM # compress the comment (e.g. convert whiteapces and '.,:;()"' to newline characters) ... 9518462SApril.Chin@Sun.COM hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}" 9528462SApril.Chin@Sun.COM # ... 
and then create a MD5 hash from this string 9538462SApril.Chin@Sun.COM hash="$(sum -x md5 <<<"${hashstring}")" 9548462SApril.Chin@Sun.COM 9558462SApril.Chin@Sun.COM nameref hc_node=hashed_comments[${hash}] 9568462SApril.Chin@Sun.COM 9578462SApril.Chin@Sun.COM if [[ "${hc_node}" == "" ]] ; then 9588462SApril.Chin@Sun.COM # build node if there isn't one yet 9598462SApril.Chin@Sun.COM typeset -a hc_node.fileids 9608462SApril.Chin@Sun.COM typeset hc_node.comment="$s" 9618462SApril.Chin@Sun.COM fi 9628462SApril.Chin@Sun.COM 9638462SApril.Chin@Sun.COM hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" ) 9648462SApril.Chin@Sun.COM 9658462SApril.Chin@Sun.COM node.license_info_found=true 9668462SApril.Chin@Sun.COM fi 9678462SApril.Chin@Sun.COM done 9688462SApril.Chin@Sun.COM 9698462SApril.Chin@Sun.COM if ! "${node.license_info_found}" ; then 9708462SApril.Chin@Sun.COM printf "## no match found in " 9718462SApril.Chin@Sun.COM printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}" 9728462SApril.Chin@Sun.COM printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \ 9738462SApril.Chin@Sun.COM "${node.comments_parsed}" \ 9748462SApril.Chin@Sun.COM "${node.fileformat_found}" \ 9758462SApril.Chin@Sun.COM "${node.file_format}" 9768462SApril.Chin@Sun.COM fi 9778462SApril.Chin@Sun.COM done 9788462SApril.Chin@Sun.COM 9798462SApril.Chin@Sun.COM # print comments and all fileids (filename+hash sums) which include this comment 9808462SApril.Chin@Sun.COM for i in "${!hashed_comments[@]}" ; do 9818462SApril.Chin@Sun.COM printf "\f## The comment (ID=%s) ..." "${i}" 9828462SApril.Chin@Sun.COM printf "\n-- snip --" 9838462SApril.Chin@Sun.COM printf "\n%s" "${hashed_comments[${i}].comment}" 9848462SApril.Chin@Sun.COM printf "\n-- snip --" 9858462SApril.Chin@Sun.COM printf "\n... 
applies to the following files:\n" 9868462SApril.Chin@Sun.COM printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array memeber 9878462SApril.Chin@Sun.COM done 9888462SApril.Chin@Sun.COM 9898462SApril.Chin@Sun.COM return 0 9908462SApril.Chin@Sun.COM} 9918462SApril.Chin@Sun.COM 9928462SApril.Chin@Sun.COMfunction do_crawl 9938462SApril.Chin@Sun.COM{ 9948462SApril.Chin@Sun.COM set -o errexit 9958462SApril.Chin@Sun.COM 99610898Sroland.mainz@nrubsig.org compound options=( 9978462SApril.Chin@Sun.COM integer max_filesize_for_scan=$((256*1024)) 9988462SApril.Chin@Sun.COM integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite) 9998462SApril.Chin@Sun.COM ) 10008462SApril.Chin@Sun.COM 10018462SApril.Chin@Sun.COM shift 10028462SApril.Chin@Sun.COM while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do 10038462SApril.Chin@Sun.COM printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|" 10048462SApril.Chin@Sun.COM case ${OPT} in 10058462SApril.Chin@Sun.COM S) options.max_filesize_for_scan="${OPTARG}" ;; 10068462SApril.Chin@Sun.COM N) options.max_num_comments="${OPTARG}" ;; 10078462SApril.Chin@Sun.COM *) usage do_crawl_usage ;; 10088462SApril.Chin@Sun.COM esac 10098462SApril.Chin@Sun.COM done 10108462SApril.Chin@Sun.COM shift $((OPTIND-1)) 10118462SApril.Chin@Sun.COM 101210898Sroland.mainz@nrubsig.org compound scan=( 10138462SApril.Chin@Sun.COM typeset -A records 10148462SApril.Chin@Sun.COM ) 10158462SApril.Chin@Sun.COM 10168462SApril.Chin@Sun.COM # read filenames from stdin 10178462SApril.Chin@Sun.COM while read i ; do 10188462SApril.Chin@Sun.COM printf "## scanning %s ...\n" "$i" 10198462SApril.Chin@Sun.COM extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true 10208462SApril.Chin@Sun.COM done 10218462SApril.Chin@Sun.COM 10228462SApril.Chin@Sun.COM # print compound variable array (we strip the "typeset -A records" for now) 102310898Sroland.mainz@nrubsig.org print 
-v scan >"crawlsrccomments_extracted_comments.cpv" 10248462SApril.Chin@Sun.COM 10258462SApril.Chin@Sun.COM print "# Wrote results to crawlsrccomments_extracted_comments.cpv" 10268462SApril.Chin@Sun.COM 10278462SApril.Chin@Sun.COM return 0 10288462SApril.Chin@Sun.COM} 10298462SApril.Chin@Sun.COM 10308462SApril.Chin@Sun.COMfunction do_getcomments 10318462SApril.Chin@Sun.COM{ 10328462SApril.Chin@Sun.COM set -o errexit 10338462SApril.Chin@Sun.COM 10348462SApril.Chin@Sun.COM # vars 103510898Sroland.mainz@nrubsig.org compound scan 10368462SApril.Chin@Sun.COM typeset database 10378462SApril.Chin@Sun.COM typeset tmp 10388462SApril.Chin@Sun.COM 103910898Sroland.mainz@nrubsig.org compound options=( 10408462SApril.Chin@Sun.COM typeset database="crawlsrccomments_extracted_comments.cpv" 10418462SApril.Chin@Sun.COM 10428462SApril.Chin@Sun.COM typeset print_stats=false 10438462SApril.Chin@Sun.COM typeset zapduplicates=false 104410898Sroland.mainz@nrubsig.org compound filepattern=( 10458462SApril.Chin@Sun.COM typeset accept="*" 10468462SApril.Chin@Sun.COM typeset reject="" 10478462SApril.Chin@Sun.COM ) 104810898Sroland.mainz@nrubsig.org compound commentpattern=( 10498462SApril.Chin@Sun.COM typeset accept="~(Ei)(license|copyright)" 10508462SApril.Chin@Sun.COM typeset reject="" 10518462SApril.Chin@Sun.COM ) 10528462SApril.Chin@Sun.COM ) 10538462SApril.Chin@Sun.COM 10548462SApril.Chin@Sun.COM shift 10558462SApril.Chin@Sun.COM while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do 10568462SApril.Chin@Sun.COM # printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|" 10578462SApril.Chin@Sun.COM case ${OPT} in 10588462SApril.Chin@Sun.COM c) options.commentpattern.accept="${OPTARG}" ;; 10598462SApril.Chin@Sun.COM C) options.commentpattern.reject="${OPTARG}" ;; 10608462SApril.Chin@Sun.COM D) options.database="${OPTARG}" ;; 10618462SApril.Chin@Sun.COM l) options.filepattern.accept="${OPTARG}" ;; 10628462SApril.Chin@Sun.COM L) options.filepattern.reject="${OPTARG}" ;; 
10638462SApril.Chin@Sun.COM S) options.print_stats=true ;; 10648462SApril.Chin@Sun.COM +S) options.print_stats=false ;; 10658462SApril.Chin@Sun.COM Z) options.zapduplicates=true ;; 10668462SApril.Chin@Sun.COM +Z) options.zapduplicates=false ;; 10678462SApril.Chin@Sun.COM *) usage do_getcomments_usage ;; 10688462SApril.Chin@Sun.COM esac 10698462SApril.Chin@Sun.COM done 10708462SApril.Chin@Sun.COM shift $((OPTIND-1)) 10718462SApril.Chin@Sun.COM 10728462SApril.Chin@Sun.COM # array of temporary files which should be cleaned-up upon exit 10738462SApril.Chin@Sun.COM typeset -a tmpfiles 10748462SApril.Chin@Sun.COM trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT 10758462SApril.Chin@Sun.COM 10768462SApril.Chin@Sun.COM # Support for HTTP URLs 107710898Sroland.mainz@nrubsig.org if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then 107810898Sroland.mainz@nrubsig.org database="/tmp/extract_license_cat_url_${PPID}_$$.tmp" 10798462SApril.Chin@Sun.COM tmpfiles+=( "${database}" ) 10808462SApril.Chin@Sun.COM print -u2 "# Loading URL..." 108110898Sroland.mainz@nrubsig.org cat_url "${options.database}" >"${database}" 10828462SApril.Chin@Sun.COM print -u2 "# Loading URL done." 10838462SApril.Chin@Sun.COM else 10848462SApril.Chin@Sun.COM database="${options.database}" 10858462SApril.Chin@Sun.COM fi 10868462SApril.Chin@Sun.COM 10878462SApril.Chin@Sun.COM if [[ ! -r "${database}" ]] ; then 10888462SApril.Chin@Sun.COM fatal_error "Can't read ${database}." 10898462SApril.Chin@Sun.COM fi 10908462SApril.Chin@Sun.COM 10918462SApril.Chin@Sun.COM # Support for compressed database files 10928462SApril.Chin@Sun.COM case "$(LC_ALL=C /usr/bin/file "${database}")" in 10938462SApril.Chin@Sun.COM *bzip2*) 10948462SApril.Chin@Sun.COM tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp" 10958462SApril.Chin@Sun.COM tmpfiles+=( "${tmp}" ) 10968462SApril.Chin@Sun.COM print -u2 "# Uncompressing data (bzip2) ..." 
10978462SApril.Chin@Sun.COM bzcat <"${database}" >"${tmp}" 10988462SApril.Chin@Sun.COM print -u2 "# Uncompression done." 10998462SApril.Chin@Sun.COM database="${tmp}" 11008462SApril.Chin@Sun.COM ;; 11018462SApril.Chin@Sun.COM *gzip*) 11028462SApril.Chin@Sun.COM tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp" 11038462SApril.Chin@Sun.COM tmpfiles+=( "${tmp}" ) 11048462SApril.Chin@Sun.COM print -u2 "# Uncompressing data (gzip) ..." 11058462SApril.Chin@Sun.COM gunzip -c <"${database}" >"${tmp}" 11068462SApril.Chin@Sun.COM print -u2 "# Uncompression done." 11078462SApril.Chin@Sun.COM database="${tmp}" 11088462SApril.Chin@Sun.COM ;; 11098462SApril.Chin@Sun.COM esac 11108462SApril.Chin@Sun.COM 11118462SApril.Chin@Sun.COM # Read compound variable which contain all recorded comments 11128462SApril.Chin@Sun.COM print -u2 "# reading records..." 111310898Sroland.mainz@nrubsig.org read -C scan <"${database}" || fatal_error 'Error reading data.' 11148462SApril.Chin@Sun.COM print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}" 11158462SApril.Chin@Sun.COM 11168462SApril.Chin@Sun.COM # print comments 11178462SApril.Chin@Sun.COM print -u2 "# processing data..." 11188462SApril.Chin@Sun.COM print "## comments start:" 11198462SApril.Chin@Sun.COM if "${options.zapduplicates}" ; then 11208462SApril.Chin@Sun.COM print_comments_duplicates_compressed scan.records options 11218462SApril.Chin@Sun.COM else 11228462SApril.Chin@Sun.COM print_comments_plain scan.records options 11238462SApril.Chin@Sun.COM fi 11248462SApril.Chin@Sun.COM print "## comments end" 11258462SApril.Chin@Sun.COM print -u2 "# processing data done." 
11268462SApril.Chin@Sun.COM 11278462SApril.Chin@Sun.COM if "${options.print_stats}" ; then 11288462SApril.Chin@Sun.COM print_stats 11298462SApril.Chin@Sun.COM fi 11308462SApril.Chin@Sun.COM 11318462SApril.Chin@Sun.COM return 0 11328462SApril.Chin@Sun.COM} 11338462SApril.Chin@Sun.COM 11348462SApril.Chin@Sun.COMfunction usage 11358462SApril.Chin@Sun.COM{ 11368462SApril.Chin@Sun.COM nameref usagemsg=$1 11378462SApril.Chin@Sun.COM OPTIND=0 11388462SApril.Chin@Sun.COM getopts -a "${progname}" "${usagemsg}" OPT '-?' 11398462SApril.Chin@Sun.COM exit 2 11408462SApril.Chin@Sun.COM} 11418462SApril.Chin@Sun.COM 11428462SApril.Chin@Sun.COMtypeset -r do_getcomments_usage=$'+ 1143*12068SRoger.Faulkner@Oracle.COM[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n] 11448462SApril.Chin@Sun.COM[-author?Roland Mainz <roland.mainz@sun.com>] 1145*12068SRoger.Faulkner@Oracle.COM[-author?Roland Mainz <roland.mainz@nrubsig.org>] 11468462SApril.Chin@Sun.COM[+NAME?getcomments - extract license information from source files] 11478462SApril.Chin@Sun.COM[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts 11488462SApril.Chin@Sun.COM license information from the "\bgetcomments\b"-database 11498462SApril.Chin@Sun.COM file created by \bcrawl\b. The script allows various 11508462SApril.Chin@Sun.COM filters (see options below) to be applied on the database] 11518462SApril.Chin@Sun.COM[+?The license extraction is done in two steps - first a crawler script 11528462SApril.Chin@Sun.COM called \bcrawl\b will scan all source files, extract 11538462SApril.Chin@Sun.COM the comments and stores this information in a "database" file called 11548462SApril.Chin@Sun.COM "crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows 11558462SApril.Chin@Sun.COM queries on this database.] 
115610898Sroland.mainz@nrubsig.org[D:database?Database file for input (either file, http:// or https://-URL).]:[database] 11578462SApril.Chin@Sun.COM[l:acceptfilepattern?Process only files which match pattern.]:[pattern] 11588462SApril.Chin@Sun.COM[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern] 11598462SApril.Chin@Sun.COM[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern] 11608462SApril.Chin@Sun.COM[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern] 11618462SApril.Chin@Sun.COM[S:stats?Print statistics.] 11628462SApril.Chin@Sun.COM[Z:zapsimilar?Combine similar/duplicate comments in the report.] 11638462SApril.Chin@Sun.COM[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)] 11648462SApril.Chin@Sun.COM' 11658462SApril.Chin@Sun.COM 11668462SApril.Chin@Sun.COMtypeset -r do_crawl_usage=$'+ 1167*12068SRoger.Faulkner@Oracle.COM[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n] 11688462SApril.Chin@Sun.COM[-author?Roland Mainz <roland.mainz@sun.com>] 1169*12068SRoger.Faulkner@Oracle.COM[-author?Roland Mainz <roland.mainz@nrubsig.org>] 11708462SApril.Chin@Sun.COM[+NAME?crawl - crawl comment information from source files] 11718462SApril.Chin@Sun.COM[+DESCRIPTION?\bcrawl\b is a small utilty script which reads 11728462SApril.Chin@Sun.COM a list of source code files from stdin, determinates the type of 11738462SApril.Chin@Sun.COM syntax used by these files and then extracts 11748462SApril.Chin@Sun.COM comments from the source code and stores this information into a 11758462SApril.Chin@Sun.COM "database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then 11768462SApril.Chin@Sun.COM be processed by \bextract_license\b or similar processing tools.] 11778462SApril.Chin@Sun.COM[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments. 
11788462SApril.Chin@Sun.COM Defaults to 256K characters.]:[numchars] 11798462SApril.Chin@Sun.COM[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments] 11808462SApril.Chin@Sun.COM[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)] 11818462SApril.Chin@Sun.COM' 11828462SApril.Chin@Sun.COM 11838462SApril.Chin@Sun.COMtypeset -r crawlsrccomments_usage=$'+ 1184*12068SRoger.Faulkner@Oracle.COM[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n] 11858462SApril.Chin@Sun.COM[-author?Roland Mainz <roland.mainz@sun.com>] 1186*12068SRoger.Faulkner@Oracle.COM[-author?Roland Mainz <roland.mainz@nrubsig.org>] 11878462SApril.Chin@Sun.COM[+NAME?crawlsrccomments - extract and filter comment information from source files] 11888462SApril.Chin@Sun.COM[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads 11898462SApril.Chin@Sun.COM a list of source code files from stdin, determinates the type of 11908462SApril.Chin@Sun.COM syntax used by these files and then extracts 11918462SApril.Chin@Sun.COM comments from the source code and stores this information into a 11928462SApril.Chin@Sun.COM "database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then 11938462SApril.Chin@Sun.COM be processed by \bextract_license\b or similar processing tools.] 11948462SApril.Chin@Sun.COM 11958462SApril.Chin@Sun.COM[crawl|getcomments] options 11968462SApril.Chin@Sun.COM 11978462SApril.Chin@Sun.COM[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)] 11988462SApril.Chin@Sun.COM' 11998462SApril.Chin@Sun.COM 12008462SApril.Chin@Sun.COM 12018462SApril.Chin@Sun.COM# program start 12028462SApril.Chin@Sun.COMbuiltin basename 12038462SApril.Chin@Sun.COMbuiltin cat 12048462SApril.Chin@Sun.COMbuiltin date 12058462SApril.Chin@Sun.COMbuiltin uname 12068462SApril.Chin@Sun.COMbuiltin rm 12078462SApril.Chin@Sun.COMbuiltin sum || fatal_error "sum builtin not found." 
12088462SApril.Chin@Sun.COM 12098462SApril.Chin@Sun.COM# exit at the first error we hit 12108462SApril.Chin@Sun.COMset -o errexit 12118462SApril.Chin@Sun.COM 12128462SApril.Chin@Sun.COMtypeset progname="${ basename "${0}" ; }" 12138462SApril.Chin@Sun.COM 12148462SApril.Chin@Sun.COMwhile getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do 12158462SApril.Chin@Sun.COM # printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|" 12168462SApril.Chin@Sun.COM case ${OPT} in 12178462SApril.Chin@Sun.COM *) usage crawlsrccomments_usage ;; 12188462SApril.Chin@Sun.COM esac 12198462SApril.Chin@Sun.COMdone 12208462SApril.Chin@Sun.COMshift $((OPTIND-1)) 12218462SApril.Chin@Sun.COM 12228462SApril.Chin@Sun.COMtypeset cmd="$1" 12238462SApril.Chin@Sun.COM 12248462SApril.Chin@Sun.COMcase "$cmd" in 12258462SApril.Chin@Sun.COM "crawl") 12268462SApril.Chin@Sun.COM progname+=" ${cmd}" 12278462SApril.Chin@Sun.COM do_crawl "$@" 12288462SApril.Chin@Sun.COM exit $? 12298462SApril.Chin@Sun.COM ;; 12308462SApril.Chin@Sun.COM "getcomments") 12318462SApril.Chin@Sun.COM progname+=" ${cmd}" 12328462SApril.Chin@Sun.COM do_getcomments "$@" 12338462SApril.Chin@Sun.COM exit $? 12348462SApril.Chin@Sun.COM ;; 12358462SApril.Chin@Sun.COM *) 12368462SApril.Chin@Sun.COM usage crawlsrccomments_usage 12378462SApril.Chin@Sun.COM ;; 12388462SApril.Chin@Sun.COMesac 12398462SApril.Chin@Sun.COM 12408462SApril.Chin@Sun.COMfatal_error "not reached." 12418462SApril.Chin@Sun.COM# EOF. 1242