#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
# LC_ALL would override LC_NUMERIC, so its value is copied to the
# other categories and LC_ALL itself is removed.
if [[ "${LC_ALL}" != "" ]] ; then
	export \
		LC_MONETARY="${LC_ALL}" \
		LC_MESSAGES="${LC_ALL}" \
		LC_COLLATE="${LC_ALL}" \
		LC_CTYPE="${LC_ALL}"
	unset LC_ALL
fi
export LC_NUMERIC=C

# constant values for tokenizer/parser stuff
# (read-only compound variable, used as ${ch.newline} etc.)
typeset -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)

# Print "<progname>: <message>" to stderr and terminate the script
# with exit code 1.
# NOTE(review): "progname" is not set in this part of the file -
# presumably it is defined by the main code; verify.
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

# Print all arguments (joined into one string) to stderr.
function printmsg
{
	print -u2 "$*"
}


# Split the attribute string of a tag into single attributes.
#   $1 - attribute string, e.g. 'foo=bar baz="x y" huz'
#   $2 - name of the array variable which receives the result; each
#        element is a compound variable with the member "name" and,
#        if the attribute had a '=', the member "value" (one pair of
#        surrounding '' or "" quotes is removed from the value)
# Calls fatal_error (i.e. exits) if more than 1000 entries are
# produced.
function attrstrtoattrarray
{
#set -o xtrace
	typeset s="$1"
	nameref aa=$2 # attribute array
	integer aa_count=0
	typeset nextattr
	integer currattrlen=0
	typeset tagstr
	typeset tagval

	while (( ${#s} > 0 )) ; do
		# skip whitespaces
		# ("currattrlen" still holds the length of the attribute
		# consumed in the previous round (0 in the first round),
		# so the scan starts right behind that attribute and the
		# substring below drops the attribute and the whitespace
		# following it in one step)
		while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
			(( currattrlen++ ))
		done
		s="${s:currattrlen:${#s}}"

		# anything left ?
		(( ${#s} == 0 )) && break

		# Pattern tests:
		#x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
		#x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
		#x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
		#x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
		# All pattern combined via eregex (w|x|y|z):
		#x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
		# "nextattr" is what remains of "s" after the leading
		# attribute has been cut off; the length difference is
		# therefore the length of that attribute
		nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
		currattrlen=$(( ${#s} - ${#nextattr}))

		# add entry
		tagstr="${s:0:currattrlen}"
		if [[ "${tagstr}" == *=* ]] ; then
			# normal case: attribute with value

			tagval="${tagstr#*=}"

			# strip quotes ('' or "")
			if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
				tagval="${tagval:1:${#tagval}-2}"
			fi

			aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
		else
			# special case for HTML where you have something like <foo baz>
			aa[${aa_count}]=( name="${tagstr}" )
		fi
		(( aa_count++ ))
		# guards against endless looping, e.g. when the pattern
		# above consumes nothing from "s"
		(( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
	done
}

# XML document handler
# Callback for the tokenizer events:
#   $1 - name of the callback array (its "arg_tree" entry names the
#        document tree variable)
#   $2 - event type; only "tag_comment" is handled, all other types
#        are ignored
#   $3 - event value (for "tag_comment": the comment text)
#   $4 - tag attributes (stored in a local variable, not used)
# A comment is appended as compound entry (tagtype="comment",
# tagvalue=<text>) to the node referenced by the current stack item
# and the matching "...num" counter is incremented.
# NOTE(review): the variable "stack" is not defined in this part of
# the file - presumably it is provided by the caller; verify.
function handle_xml_document
{
#set -o xtrace
	nameref callbacks=${1}
	typeset tag_type="${2}"
	typeset tag_value="${3}"
	typeset tag_attributes="${4}"
	nameref doc=${callbacks["arg_tree"]}
	nameref nodepath="${stack.items[stack.pos]}"
	nameref nodesnum="${stack.items[stack.pos]}num"

	case "${tag_type}" in
		tag_comment)
			nodepath[${nodesnum}]+=(
				typeset tagtype="comment"
				typeset tagvalue="${tag_value}"
			)
			(( nodesnum++ ))
			;;
	esac

#	print "xmltok: '${tag_type}' = '${tag_value}'"
}

# Simple XML/HTML tokenizer.
# Reads the document character-wise from stdin and reports the
# tokens via callback functions.
#   $1 - name of an array which maps the event names
#        "document_start", "tag_text", "tag_begin", "tag_end",
#        "tag_comment" and "document_end" to the command which
#        handles the event; empty/missing entries are skipped.
# Each callback is called with the array name ($1), the event name
# and the event data (text, tag name, comment text; "tag_begin" gets
# the attribute string as additional argument, "document_end" gets
# "exit_success").
# A final newline is printed to stdout at the end.
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset pending=""  # character read too far while closing a comment
	typeset isendtag    # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			# the first character after '<' decides whether
			# this is an end tag ("</foo>")
			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			# read the remaining tag up to (excluding) '>'
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					# no, the '>' we stopped at was part
					# of the comment text - put it back
					# and continue until the "--" of the
					# terminating "-->" shows up
					buf+=">"
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done

					# The '>' which closes the comment
					# has not been consumed in this
					# branch; read it now so that it does
					# not leak into the following text.
					# Any other character (malformed
					# input) is kept as start of the next
					# text chunk.
					if IFS='' read -r -N 1 c && [[ "$c" != ">" ]] ; then
						pending="$c"
					fi
				fi

				# pass the text between "!--" and "--"
				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf="${pending}"
				pending=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# enumerate comments in a shell (or shell-like) script
#   $1 - input file
#   $2 - name of the array which receives the comments; consecutive
#        lines starting with '#' form one comment (leading '#'
#        removed, lines joined with newlines)
#   $3 - comment limit; scanning stops as soon as more than this
#        number of comments has been stored
# Always returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line

	# The loop runs one extra round after "read" failed (EOF) so that
	# a pending comment is flushed by the then empty "line"
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# If the last line is a comment line without trailing newline
	# "read" fails but still delivers the line, i.e. the comment
	# was collected above but not stored yet
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
		comment=""
	fi

	return 0
}


# enumerate comments in a troff document
#   $1 - input file
#   $2 - name of the array which receives the comments; consecutive
#        lines starting with optional dots followed by \" form one
#        comment (comment marker removed, lines joined with newlines)
#   $3 - comment limit; scanning stops as soon as more than this
#        number of comments has been stored
# Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line

	# The loop runs one extra round after "read" failed (EOF) so that
	# a pending comment is flushed by the then empty "line"
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# If the last line is a comment line without trailing newline
	# "read" fails but still delivers the line, i.e. the comment
	# was collected above but not stored yet
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
		comment=""
	fi

	return 0
}


# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - comment limit; scanning stops as soon as more than this
#        number of comments has been stored
#   $4 - maximum number of characters to scan; longer files are
#        truncated (a warning is printed to stderr)
# The content is scanned character by character with a small state
# machine (inside C comment, inside C++ comment, inside '...' or
# "..." literal, or plain code).  A "//" comment which starts at the
# same column as the "//" comment in the line directly above is
# appended to that comment instead of creating a new entry.
# Returns 0 on success and 1 if the scan ended in an inconsistent
# state (unterminated comment or double-quoted literal, leftover
# comment text); the problems are reported on stderr.
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	typeset line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2 # current character and current+next character

	typeset comment

	typeset state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		typeset cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			typeset comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			typeset comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				# skip the second character of "*/", too
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			# a C++ comment ends at the end of the line or at EOF
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text
				if ${state.cxx.comment_continued} ; then
					# continuation of the comment in the previous
					# line: append to the previous entry
					comment_array[ca-1]+="${ch.newline}${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				else
					comment_array[ca++]="${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				fi
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			# a quote directly preceded by a backslash does not end
			# the literal (only the single preceding character is
			# inspected)
			if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				# skip the second character of "/*", too
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				# skip the second character of "//", too
				(( file_pos++, line_pos.x++ ))
				# same column as the "//" one line above ?
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}

# determine file type
#   $1 - filename
#   $2 - name of the variable which receives the format name
#        (e.g. "c_source", "shell", "troff", ...)
# Returns 0 if a format was found and 1 otherwise (including the
# case that $1 is not a plain, readable file).
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	[[ ! -f "$filename" ]] && return 1
	[[ ! -r "$filename" ]] && return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source"
			return 0
			;;
		*Imakefile)
			file_format="imakefile"
			return 0
			;;
		*Makefile)
			file_format="makefile"
			return 0
			;;
	esac

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff"
			return 0
			;;
		~(E)html\ document)
			file_format="html"
			return 0
			;;
		~(E)sgml\ document)
			file_format="sgml"
			return 0
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell"
			return 0
			;;
		~(E)executable.*/perl\ script)
			file_format="perl"
			return 0
			;;
	esac

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)
			file_format="troff"
			return 0
			;;
		*.html)
			file_format="html"
			return 0
			;;
		*.sgml)
			file_format="sgml"
			return 0
			;;
		*.xml)
			file_format="xml"
			return 0
			;;
		*.png)
			file_format="image_png"
			return 0
			;;
		*.xcf)
			file_format="image_xcf"
			return 0
			;;
		*.shar)
			file_format="archive_shell"
			return 0
			;;
		*.sh)
			file_format="shell"
			return 0
			;;
		*.pcf)
			file_format="font_pcf"
			return 0
			;;
		*.bdf)
			file_format="font_bdf"
			return 0
			;;
		*.pmf)
			file_format="font_pmf"
			return 0
			;;
		*.ttf | *.otf)
			file_format="font_ttf"
			return 0
			;;
		*.pfa | *.pfb)
			file_format="font_postscript"
			return 0
			;;
	esac

	return 1
}

# Create the record for one file and collect its comments.
#   $1 - name of the array which receives the record; the record is
#        stored under the index <filename> and holds the filename,
#        the detected file format, the MD5 and SHA1 hash sums and
#        the array of comments
#   $2 - filename
#   $3 - comment limit (passed to the comment enumerators)
#   $4 - maximum number of characters to scan (passed to the comment
#        enumerators; only enumerate_comments_cpp makes use of it)
# Returns 1 if the file format could not be determined, otherwise 0.
# "comments_parsed" is only set to "true" if the enumerator for the
# file format succeeded; formats without enumerator are left alone.
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum  -x md5  < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if get_file_format "$filename" datatype ; then
		records[${filename}].fileformat_found="true"
		records[${filename}].file_format="$datatype"
	else
		return 1
	fi

	case "$datatype" in
		c_source|imakefile)
			enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		shell|makefile)
			enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		troff)
			enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
		#		records[${filename}].comments_parsed=true
		#	;;
	esac

	return 0
}

# parse HTTP return code, cookies etc.
#
# parse_http_response - read HTTP status line and headers from stdin
#
# Arguments:
#   $1 - name of a compound variable which receives the fields
#        "statuscode", "statusmsg" and (if the matching headers are
#        present) "content_type", "content_length", "transfer_encoding"
#
# Reading stops after the blank line which terminates the headers, i.e.
# stdin is left positioned at the start of the body.
#
# Returns 1 if the status line is malformed, 0 otherwise.
#
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# require at least one digit (an empty status code is invalid, too)
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# walk over the remaining headers
	while IFS='' read -r i ; do
		# a blank line (with or without '\r') ends the header section
		[[ "$i" == $'\r' || "$i" == "" ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		# The value is everything after the FIRST ':' (plus blanks);
		# "[^:]*:" is used instead of ".*:" because the latter is greedy
		# and would cut values which contain a ':' themselves.
		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}

#
# cat_http_body - copy a HTTP body from stdin to stdout
#
# Arguments:
#   $1 - transfer encoding; "chunked" enables de-chunking, any other
#        value copies stdin verbatim
#
# Always returns 0.
#
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# Layout of the stream: <hexsize>[;ext]CRLF <data> CRLF ... 0 CRLF
		while IFS=$'\r' read -r hexchunksize ; do
			# The data of each chunk is followed by a CRLF which shows
			# up here as an empty line - skip it instead of treating
			# it as the end of the body.
			[[ "${hexchunksize}" == "" ]] && continue

			# drop optional chunk extensions (";name=value")
			hexchunksize="${hexchunksize%%;*}"

			# anything which is not a hex number ends the body
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break

			# a chunk size of 0 marks the last chunk
			(( chunksize=16#${hexchunksize} )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null
		done
	else
		cat
	fi

	return 0
}

#
# cat_http - fetch an URL and write the response body to stdout
#
# Arguments:
#   $1 - URL of the form <protocol>://<host>[:<port>]/<path>
#
# Diagnostics go to stderr.
# Returns 1 if the URL is incomplete or the connection could not be
# opened, 0 otherwise.
#
function cat_http
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	typeset host="${path1%%/*}"
	typeset path=""
	typeset port="${host##*:}"
	typeset request		# HTTP request text (kept local to this function)

	integer netfd
	typeset -C httpresponse # http response

	# Only URLs with a '/' after the host part have a path; without this
	# check "${path1#*/}" would return the host name as path.
	if [[ "${path1}" == */* ]] ; then
		path="${path1#*/}"
	fi

	# If URL did not contain a port number in the host part then look at the
	# protocol to get the port number
	if [[ "${port}" == "${host}" ]] ; then
		case "${protocol}" in
			"http")	port=80 ;;
			*)	port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
		esac
	else
		host="${host%:*}"
	fi

	printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

	# prechecks
	[[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
	[[ "${port}"     == "" ]] && { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
	[[ "${host}"     == "" ]] && { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
	[[ "${path}"     == "" ]] && { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

	# open TCP channel ("||" keeps the error handling reachable even
	# when errexit is active)
	redirect {netfd}<>"/dev/tcp/${host}/${port}" || \
		{ print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }

	# send HTTP request ("print" expands the "\r\n" sequences)
	request="GET /${path} HTTP/1.1\r\n"
	request+="Host: ${host}\r\n"
	request+="User-Agent: crawlsrccomments/ksh93 (2008-06-14; $(uname -s -r -p))\r\n"
	request+="Connection: close\r\n"
	print -n -- "${request}\r\n" >&${netfd}

	# collect response and send it to stdout
	parse_http_response httpresponse <&${netfd}
	cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

	# close connection
	redirect {netfd}<&-

	return 0
}

#
# print_stats - print a summary about the records
#
# Arguments:
#   $1 - (optional) name of the associative array with the records.
#        ksh93 functions do not see the local variables of their caller,
#        therefore the array has to be handed in by name. Without an
#        argument a global variable called "records" is used.
#
# The counters are printed to stdout as compound variable.
#
function print_stats
{
	set -o errexit

	if (( $# > 0 )) ; then
		nameref records="$1"
	fi

	typeset i

	# gather some statistics
	typeset stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# The flags are compared as strings (instead of being executed as
	# commands) since "license_info_found" is not set for records which
	# were skipped by the file pattern filters.
	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments++ ))
		else
			(( stats.files_without_comments++ ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format++ ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info++ ))
		else
			(( stats.files_without_license_info++ ))
		fi

		(( stats.total_num_files++ ))
	done

	printf "%B\n" stats
	return 0
}


#
# print_comments_plain - print all matching comments, file by file
#
# Arguments:
#   $1 - name of the associative array with the records
#   $2 - name of the compound variable with the filter options
#        ("filepattern.accept", "filepattern.reject",
#        "commentpattern.accept", "commentpattern.reject")
#
# Side effect: sets "license_info_found" in each record which passes the
# file pattern filters.
#
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file name filters
		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# "reject" is tested last and therefore wins over "accept"
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}

#
# print_comments_duplicates_compressed - print matching comments, each
# distinct comment only once together with the list of files which
# contain it
#
# Arguments:
#   $1 - name of the associative array with the records
#   $2 - name of the compound variable with the filter options
#        (same fields as used by print_comments_plain)
#
# Comments are treated as identical if the MD5 sum of their lowercased,
# separator-compressed text is identical.
#
# Side effect: sets "license_info_found" in each record which passes the
# file pattern filters.
#
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset hash		# MD5 sum of the compressed comment
	typeset -A hashed_comments

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file name filters
		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# "reject" is tested last and therefore wins over "accept"
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi


			if "${match}" ; then
				typeset -l hashstring # lowercase

				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}

#
# do_crawl - implementation of the "crawl" sub-command
#
# Arguments: the sub-command name followed by its options (see
# do_crawl_usage).
#
# Reads file names from stdin (one per line), extracts the comments of
# each file and writes the collected records to
# "crawlsrccomments_extracted_comments.cpv" in the current directory.
#
function do_crawl
{
	set -o errexit

	typeset i
	typeset options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"	;;
			N)	options.max_num_comments="${OPTARG}"		;;
			*)	usage do_crawl_usage				;;
		esac
	done
	shift $((OPTIND-1))

	typeset scan=(
		typeset -A records
	)

	# read filenames from stdin ("IFS=" and "-r" keep blanks and
	# backslashes in the file names intact)
	while IFS= read -r i ; do
		printf "## scanning %s ...\n" "$i"
		# a file which cannot be processed must not stop the crawl
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	printf "%B\n" scan |
		sed $'s/^#.*$//;s/^\(//;s/^\)//;s/^\ttypeset -A records=\(//;s/^\t\)//' >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}

#
# do_getcomments - implementation of the "getcomments" sub-command
#
# Arguments: the sub-command name followed by its options (see
# do_getcomments_usage).
#
# Loads the database (plain file, bzip2/gzip compressed file or http://
# URL), prints the comments selected by the filter options to stdout and
# optionally prints statistics. Progress messages go to stderr.
#
function do_getcomments
{
	set -o errexit

	# vars
	typeset scan=(
		typeset -A records
	)
	typeset database
	typeset tmp

	typeset options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		typeset filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		typeset commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}"	;;
			C)	options.commentpattern.reject="${OPTARG}"	;;
			D)	options.database="${OPTARG}"			;;
			l)	options.filepattern.accept="${OPTARG}"		;;
			L)	options.filepattern.reject="${OPTARG}"		;;
			S)	options.print_stats=true			;;
			+S)	options.print_stats=false			;;
			Z)	options.zapduplicates=true			;;
			+Z)	options.zapduplicates=false			;;
			*)	usage do_getcomments_usage			;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	# NOTE(review): the temporary file names below are predictable names
	# in /tmp - consider creating them via mktemp(1).
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)http://.* ]] ; then
		database="/tmp/extract_license_cat_http_${PPID}_$$.tmp"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_http "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	# (the brackets stripped by "crawl" are added again)
	print -u2 "# reading records..."
	{
		printf "("
		cat "${database}"
		printf ")\n"
	} | read -C scan.records || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# the records are local to this function and have to be
		# handed to print_stats by name
		print_stats scan.records
	fi

	return 0
}

#
# usage - print the usage message and exit with status 2
#
# Arguments:
#   $1 - name of the variable which contains the getopts usage string
#
function usage
{
	nameref usagemsg=$1
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'
	exit 2
}

# getopts usage string of the "getcomments" sub-command
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
	queries on this database.]
[D:database?Database file for input (either file or http://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

# getopts usage string of the "crawl" sub-command
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

# getopts usage string of the top-level command
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'


# program start
# (load the shell builtin versions of the utilities used above)
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."
# From here on the first failing command terminates the script
set -o errexit

# name used as prefix for diagnostics and usage messages
typeset progname="${ basename "${0}" ; }"

# The top-level command defines no options of its own, i.e. every option
# found in front of the sub-command ends up in the usage message.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	usage crawlsrccomments_usage
done
shift $((OPTIND-1))

# first remaining argument is the sub-command
typeset cmd="$1"

case "$cmd" in
	"crawl" | "getcomments")
		# The sub-commands are implemented by the functions "do_crawl"
		# and "do_getcomments"; they receive all arguments including
		# the sub-command name itself.
		progname+=" ${cmd}"
		"do_${cmd}" "$@"
		exit $?
		;;
	*)
		usage crawlsrccomments_usage
		;;
esac

fatal_error "not reached."
# EOF.