1*8462SApril.Chin@Sun.COM#!/usr/bin/ksh93
2*8462SApril.Chin@Sun.COM
3*8462SApril.Chin@Sun.COM#
4*8462SApril.Chin@Sun.COM# CDDL HEADER START
5*8462SApril.Chin@Sun.COM#
6*8462SApril.Chin@Sun.COM# The contents of this file are subject to the terms of the
7*8462SApril.Chin@Sun.COM# Common Development and Distribution License (the "License").
8*8462SApril.Chin@Sun.COM# You may not use this file except in compliance with the License.
9*8462SApril.Chin@Sun.COM#
10*8462SApril.Chin@Sun.COM# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11*8462SApril.Chin@Sun.COM# or http://www.opensolaris.org/os/licensing.
12*8462SApril.Chin@Sun.COM# See the License for the specific language governing permissions
13*8462SApril.Chin@Sun.COM# and limitations under the License.
14*8462SApril.Chin@Sun.COM#
15*8462SApril.Chin@Sun.COM# When distributing Covered Code, include this CDDL HEADER in each
16*8462SApril.Chin@Sun.COM# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17*8462SApril.Chin@Sun.COM# If applicable, add the following below this CDDL HEADER, with the
18*8462SApril.Chin@Sun.COM# fields enclosed by brackets "[]" replaced with your own identifying
19*8462SApril.Chin@Sun.COM# information: Portions Copyright [yyyy] [name of copyright owner]
20*8462SApril.Chin@Sun.COM#
21*8462SApril.Chin@Sun.COM# CDDL HEADER END
22*8462SApril.Chin@Sun.COM#
23*8462SApril.Chin@Sun.COM
24*8462SApril.Chin@Sun.COM#
25*8462SApril.Chin@Sun.COM# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
26*8462SApril.Chin@Sun.COM# Use is subject to license terms.
27*8462SApril.Chin@Sun.COM#
28*8462SApril.Chin@Sun.COM
29*8462SApril.Chin@Sun.COM#
30*8462SApril.Chin@Sun.COM# rssread - a simple RSS2.0 reader with RSS to XHTML to
31*8462SApril.Chin@Sun.COM# plaintext conversion.
32*8462SApril.Chin@Sun.COM#
33*8462SApril.Chin@Sun.COM
# Search the POSIX-conformant Solaris tool directories (/usr/xpg6/bin and
# /usr/xpg4/bin) before /bin and /usr/bin, because the Solaris tools in
# /usr/bin are not POSIX-conformant.
PATH='/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin'
export PATH
36*8462SApril.Chin@Sun.COM
# printmsg - write all arguments, joined into one line, to stderr
function printmsg
{
	print "$*" >&2
}
41*8462SApril.Chin@Sun.COM
# debugmsg - debugging hook.  Message output is currently disabled: the
# arguments are ignored and the function just reports success.  Enable the
# "printmsg" line below to send debug messages to stderr.
function debugmsg
{
	# printmsg "$*"
	return 0
}
47*8462SApril.Chin@Sun.COM
# fatal_error - report a fatal error and terminate the script
#
# Writes "<progname>: <arguments>" to stderr (the global variable "progname"
# supplies the prefix) and exits with status 1.
function fatal_error
{
	typeset message="${progname}: $*"

	print -u2 "${message}"
	exit 1
}
53*8462SApril.Chin@Sun.COM
# parse_http_response - parse the status line and the headers of a HTTP
# response read from stdin
#
# Arguments:
#   $1 - name of the variable which receives the results as sub-variables:
#        .statuscode and .statusmsg (always), plus .content_type,
#        .content_length and .transfer_encoding (only when the matching
#        header is present in the response)
# Input:
#   stdin - HTTP response; the status line and all header lines up to and
#           including the empty line terminating them are consumed, the
#           body is left unread
# Returns:
#   0 on success, 1 (with a message on stderr) if the status line does not
#   start with "HTTP/" or carries no numeric status code
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# "[0-9]+" (not "[0-9]*") so that an empty/missing status code is rejected, too
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		# the header block ends with an empty line ("\r\n", or a bare "\n"
		# from servers which do not send the '\r')
		[[ "$i" == $'\r' || "$i" == "" ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		# pick up the headers we are interested in; the value is whatever
		# follows "<name>:" and optional blanks
		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}
89*8462SApril.Chin@Sun.COM
# cat_http_body - copy the body of a HTTP response from stdin to stdout
#
# Arguments:
#   $1 - value of the "Transfer-Encoding" header.  "chunked" selects the
#        chunk decoder; any other value copies stdin unmodified
# Input:
#   stdin - response body (positioned right after the headers)
# Returns:
#   always 0
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset -i chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# chunked body layout:
		#   <hex size>[;extension]\r\n<size bytes of data>\r\n ... 0\r\n\r\n
		while IFS='' read -r hexchunksize ; do
			# strip the trailing '\r' and an optional chunk extension
			hexchunksize="${hexchunksize%%$'\r'*}"
			hexchunksize="${hexchunksize%%;*}"

			# the data of each chunk is followed by "\r\n", which shows up
			# here as an empty line - skip it and read the next chunk size
			# (otherwise decoding would stop after the first chunk)
			[[ "${hexchunksize}" == "" ]] && continue

			# anything else than a hexadecimal number ends the body
			[[ "${hexchunksize}" == +([0-9a-fA-F]) ]] || break
			(( chunksize=16#${hexchunksize} ))

			# a chunk size of 0 marks the end of the body
			(( chunksize > 0 )) || break

			# bs=1 makes sure "dd" consumes exactly ${chunksize} bytes
			dd bs=1 count="${chunksize}" 2>/dev/null
		done
	else
		cat
	fi

	return 0
}
108*8462SApril.Chin@Sun.COM
# cat_http - fetch an URL via HTTP/1.1 and write the response body to stdout
#
# Arguments:
#   $1 - URL of the form "<protocol>://<host>[:<port>]/<path>"
# Outputs:
#   stdout - response body (decoded by cat_http_body)
#   stderr - the parsed URL components and error messages
# Returns:
#   0 on success; 1 if a URL component is missing, the TCP connection
#   could not be opened or parse_http_response rejected the response
function cat_http
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	typeset host="${path1%%/*}"
	typeset hostheader="${host}" # value of the "Host:" header (keeps an explicit ":<port>")
	typeset path=""
	typeset port="${host##*:}"
	typeset request # local, to avoid leaking the request text into the global scope

	integer netfd
	integer res=0
	typeset -C httpresponse # http response

	# Only an URL with a "/" after the host part carries a path.  Without
	# this check "http://foo.bat.net" would be turned into a request for
	# "/foo.bat.net".
	[[ "${path1}" == */* ]] && path="${path1#*/}"

	# If URL did not contain a port number in the host part then look at the
	# protocol to get the port number
	if [[ "${port}" == "${host}" ]] ; then
		case "${protocol}" in
			"http") port=80 ;;
			*)      port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
		esac
	else
		host="${host%:*}"
	fi

	printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

	# prechecks
	[[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
	[[ "${port}"     == "" ]] && { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
	[[ "${host}"     == "" ]] && { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
	[[ "${path}"     == "" ]] && { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

	# open TCP channel
	redirect {netfd}<>"/dev/tcp/${host}/${port}"
	(( $? != 0 )) && { print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }

	# send HTTP request
	# (the line terminators are real CR/LF characters and the request is
	# written with "print -r" so that backslashes in the URL are sent
	# unmodified instead of being interpreted as escape sequences)
	request="GET /${path} HTTP/1.1"$'\r\n'
	request+="Host: ${hostheader}"$'\r\n'
	request+="User-Agent: rssread/ksh93 (2008-10-14; $(uname -s -r -p))"$'\r\n'
	request+="Connection: close"$'\r\n'
	print -n -r -- "${request}"$'\r\n' >&${netfd}

	# collect response and send it to stdout
	# (the body is only copied if the status line/headers were valid)
	if parse_http_response httpresponse <&${netfd} ; then
		cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}
	else
		res=1
	fi

	# close connection (on the error path, too)
	redirect {netfd}<&-

	return ${res}
}
160*8462SApril.Chin@Sun.COM
# html_entity_to_ascii - filter which replaces XML/HTML entities
# ("&name;", "&#<decimal>;", "&#x<hex>;") with the characters they stand for
#
# Input:   stdin  - text containing entities
# Output:  stdout - text with the entities replaced.  Unknown entities are
#                   written as "ENT=|<name>|"; an "&" which does not start
#                   an entity is passed through unmodified
# Returns: always 0
function html_entity_to_ascii
{
	typeset entity
	typeset c
	typeset value
	integer codepoint

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		# everything except '&' is copied 1:1
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		# collect the entity name up to the terminating ';'
		entity=""
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
				break
				;;
			~(Eilr)[a-z0-9#])
				entity+="$c"
				continue
				;;
			*)
#				debugmsg "error &${entity}${c}#"

				# not an entity (e.g. the "&" in "AT&T rocks"): pass
				# the text through unmodified, including the "&"
				print -n -r -- "&${entity}${c}"
				entity=""
				continue 2
				;;
			esac
		done

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity}" == '#'[xX]+([0-9a-fA-F]) ]] ; then
				# hexadecimal character reference (e.g. "&#x20AC;")
				value="${ printf "\u[${entity:2:8}]" ; }"
			elif [[ "${entity}" == '#'+([0-9]) ]] ; then
				# decimal character reference (e.g. "&#039;"); the
				# number is forced to base 10 so that leading zeros
				# are never taken as an octal prefix
				(( codepoint=10#${entity:1:8} ))
				value="${ printf "\u[${ printf "%x" "${codepoint}" ; }]" ; }"
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]* ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			# remember the result for the next lookup
			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		printf "%s" "${value}"
	done

	return 0
}
236*8462SApril.Chin@Sun.COM
# handle_html - xml_tok callback which renders (simple) XHTML as plain text
#
# This is a dumb handler: CSS, tables, images, iframes and nested
# structures are not supported and the input is assumed to be correct XHTML.
#
# Arguments:
#   $1 - name of the callback associative array; its "html_pre" element
#        ('true'/'false') records whether we are inside a <pre> element.
#        State is kept there (not in global variables) because several
#        callback tables may share this function
#   $2 - event type: tag_begin, tag_end, tag_text, document_start or
#        document_end
#   $3 - tag name (tag_begin/tag_end) or text content (tag_text)
# Output:  the plain text rendering on stdout
# Returns: always 0
function handle_html
{
    typeset -n callbacks="${1}"
    typeset event="$2"
    typeset payload="$3"

    case "${event}" in
        document_start)
            callbacks["html_pre"]='false'
            ;;

        tag_text)
            if ${callbacks["html_pre"]} ; then
                # preformatted text is copied verbatim
                printf "%s" "${payload}"
            else
                # compress spaces/newlines/tabs/etc.
                printf "%s" "${payload//+([\n\r\t\v[:space:][:blank:]])/ }"
            fi
            ;;

        tag_end)
            if [[ "${payload}" == "pre" ]] ; then
                callbacks["html_pre"]='false'
            fi
            ;;

        tag_begin)
            case "${payload}" in
                br | p) printf "\n" ;;
                hr)     printf "\n-------------------------------------\n" ;;
                pre)    callbacks["html_pre"]='true' ;;
            esac
            ;;

        document_end)
            ;;
    esac

    return 0
}
283*8462SApril.Chin@Sun.COM
# handle_rss - xml_tok callback which collects the elements of each RSS
# <item> and prints the item as plain text when it is complete
#
# Arguments:
#   $1 - name of the callback associative array; its "textbuf" element
#        accumulates the text of the element which is currently parsed.
#        State is kept there (not in global variables) because several
#        callback tables may share this function
#   $2 - event type: tag_begin, tag_end, tag_text, document_start or
#        document_end
#   $3 - tag name (tag_begin/tag_end) or text content (tag_text)
# Globals:
#   item - associative array holding the fields of the current RSS item
# Output:  one plain text block per RSS item on stdout
# Returns: always 0
function handle_rss
{
	typeset -n callbacks="${1}"
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# start each item with empty fields; this includes
					# "author" and "pubDate", otherwise an item without
					# these elements would show the values of the
					# previous item
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["description"]=""
					item["author"]=""
					item["pubDate"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link:   %s" "${item["link"]}"
						printf $"<br />## date:   %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				# the text collected since the start tag becomes the value of the field
				title)                item["title"]="${callbacks["textbuf"]}"        ; callbacks["textbuf"]="" ;;
				link)                 item["link"]="${callbacks["textbuf"]}"         ; callbacks["textbuf"]="" ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"       ; callbacks["textbuf"]="" ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"      ; callbacks["textbuf"]="" ;;
				description)          item["description"]="${callbacks["textbuf"]}"  ; callbacks["textbuf"]="" ;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}
340*8462SApril.Chin@Sun.COM
# xml_tok - minimal XML tokenizer
#
# Reads XML from stdin character by character and reports what it finds
# to the callback functions registered in an associative array.
#
# Arguments:
#   $1 - name of the callback associative array.  The elements
#        "document_start", "tag_begin", "tag_end", "tag_text",
#        "tag_comment" and "document_end" hold the name of the function
#        to call for the matching event (empty/unset: no call).  Each
#        callback is called with the array name, the event type and the
#        tag name/text/comment as arguments ("tag_begin" gets the
#        attribute string as additional argument)
# Output:  whatever the callbacks print, followed by a final newline
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # a '/' right after the '<' marks an end tag
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag up to (excluding) the next '>'
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # No - the '>' we stopped at is part of the comment text.
                    # Put it back and keep reading until the real terminator
                    # "-->" was seen.  The terminating '>' must be consumed
                    # here, otherwise it would be reported as text content.
                    buf+=">"
                    while [[ "$buf" != '!--'*'-->' ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                    # drop the '>' of the terminator (buf is "!--...--" again)
                    [[ "$buf" == '!--'*'-->' ]] && buf="${buf%>}"
                fi

                # pass the text between "!--" and "--" to the callback
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            # plain text - collect it until the next tag starts
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
425*8462SApril.Chin@Sun.COM
# get_lc_messages - print the locale name which subprocesses running in a
# different locale/encoding should use as LC_MESSAGES
#
# The first non-empty value of $LC_ALL, $LC_MESSAGES and $LANG (in this
# order) is printed; "C" is printed if all of them are empty.
# Returns: always 0
function get_lc_messages
{
	typeset candidate

	for candidate in "${LC_ALL}" "${LC_MESSAGES}" "${LANG}" ; do
		if [[ -n "${candidate}" ]] ; then
			print "${candidate}"
			return 0
		fi
	done

	print "C"
	return 0
}
435*8462SApril.Chin@Sun.COM
# do_rssread - fetch a RSS feed and print it as plain text (UTF-8)
#
# Arguments:
#   $1 - URL of the RSS feed
# Output:  plain text rendering of the feed on stdout
# Returns: always 0
# Note:    the locale variables of the calling shell environment are
#          changed (unless the function runs in a subshell, e.g. as part
#          of a pipeline)
function do_rssread
{
	# remember the caller's message locale before the locale is switched
	typeset messages_locale="${ get_lc_messages ; }"

	# $LC_ALL takes precedence over all other LC_* variables and $LANG; an
	# inherited value would prevent the UTF-8 settings below from taking
	# effect
	unset LC_ALL

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${messages_locale}" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# need extra newline after cat_http to terminate line with $'\n'
	# to make "xml_tok" happy
	{ cat_http "$1" ; print ; } |
		xml_tok "rsstok_cb"
	return 0
}
457*8462SApril.Chin@Sun.COM
# usage - print the usage message and terminate the script
#
# The message is generated by "getopts" from the option description in the
# global variable "rssread_usage" (with the global "progname" as command
# name) by handing it the option '-?'.  Exits with status 2.
function usage
{
	# restart option processing so that getopts looks at the '-?' below
	OPTIND=0

	getopts -a "${progname}" "${rssread_usage}" OPT '-?'

	exit 2
}
464*8462SApril.Chin@Sun.COM
# make sure we use the ksh93 builtin versions
builtin basename
builtin cat

# callbacks for xml_tok used to parse the RSS feed; "textbuf" is scratch
# storage for handle_rss
typeset -A rsstok_cb=(
	["tag_begin"]="handle_rss"
	["tag_end"]="handle_rss"
	["tag_text"]="handle_rss"
	["textbuf"]=""
)

# callbacks for xml_tok used to convert XHTML to plain text; "html_pre"
# is the "inside <pre>" flag of handle_html
typeset -A xhtmltok_cb=(
	["tag_begin"]="handle_html"
	["tag_end"]="handle_html"
	["tag_text"]="handle_html"
	["textbuf"]=""
	["html_pre"]='false'
)

# fields of the RSS item which is currently parsed (filled by handle_rss)
typeset -A item
# bookmarks: short names which can be given instead of an URL
# ("random" urls for testing)
typeset -A bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# OpenSolaris.org sites
	["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
	["shell"]="http://www.opensolaris.org/rss/os/project/shell/announcements/rss2.xml"
	["systemz"]="http://www.opensolaris.org/rss/os/project/systemz/announcements/rss2.xml"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
)
506*8462SApril.Chin@Sun.COM
# name of this script (without directory part), used in messages
typeset progname="$(basename "${0}")"

# option description handed to "getopts" (option parsing and usage message)
typeset rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2008-11-10 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
        which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'
typeset -r rssread_usage
523*8462SApril.Chin@Sun.COM
# main: parse the options, resolve bookmarks and print the feed

# true: leave the output in UTF-8; false: convert it to the current
# locale/encoding
typeset noiconv=false

while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)
			noiconv=true
			;;
		+I)
			noiconv=false
			;;
		*)
			usage
			;;
	esac
done
shift $((OPTIND-1))

# the first operand is either an URL or the name of a bookmark
typeset url="$1"

[[ -z "${url}" ]] && fatal_error $"No url given."

# replace a bookmark name with the URL stored for it
if [[ -n "${bookmark_urls[${url}]}" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
	url="${bookmark_urls[${url}]}"
fi

if ! ${noiconv} ; then
	# convert the UTF-8 output of do_rssread
	do_rssread "${url}" | iconv -f "UTF-8" - -
else
	do_rssread "${url}"
fi

exit 0
#EOF.
555