xref: /plan9/sys/src/cmd/aux/antiword/findtext.c (revision 25b329d522281a8cdd35da0dcc08c3fc621059a9)
1 /*
2  * findtext.c
3  * Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
4  *
5  * Description:
6  * Find the blocks that contain the text of MS Word files
7  */
8 
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include "antiword.h"
12 
13 
14 /*
15  * bAddTextBlocks - Add the blocks to the text block list
16  *
17  * Returns TRUE when successful, FALSE if not
18  */
19 BOOL
bAddTextBlocks(ULONG ulCharPosFirst,ULONG ulTotalLength,BOOL bUsesUnicode,USHORT usPropMod,ULONG ulStartBlock,const ULONG * aulBBD,size_t tBBDLen)20 bAddTextBlocks(ULONG ulCharPosFirst, ULONG ulTotalLength,
21 	BOOL bUsesUnicode, USHORT usPropMod,
22 	ULONG ulStartBlock, const ULONG *aulBBD, size_t tBBDLen)
23 {
24 	text_block_type	tTextBlock;
25 	ULONG	ulCharPos, ulOffset, ulIndex;
26 	long	lToGo;
27 
28 	fail(ulTotalLength > (ULONG)LONG_MAX / 2);
29 	fail(ulStartBlock > MAX_BLOCKNUMBER && ulStartBlock != END_OF_CHAIN);
30 	fail(aulBBD == NULL);
31 
32 	NO_DBG_HEX(ulCharPosFirst);
33 	NO_DBG_DEC(ulTotalLength);
34 
35 	if (bUsesUnicode) {
36 		/* One character equals two bytes */
37 		NO_DBG_MSG("Uses Unicode");
38 		lToGo = (long)ulTotalLength * 2;
39 	} else {
40 		/* One character equals one byte */
41 		NO_DBG_MSG("Uses ASCII");
42 		lToGo = (long)ulTotalLength;
43 	}
44 
45 	ulCharPos = ulCharPosFirst;
46 	ulOffset = ulCharPosFirst;
47 	for (ulIndex = ulStartBlock;
48 	     ulIndex != END_OF_CHAIN && lToGo > 0;
49 	     ulIndex = aulBBD[ulIndex]) {
50 		if (ulIndex >= (ULONG)tBBDLen) {
51 			DBG_DEC(ulIndex);
52 			DBG_DEC(tBBDLen);
53 			werr(1, "The Big Block Depot is damaged");
54 		}
55 		if (ulOffset >= BIG_BLOCK_SIZE) {
56 			ulOffset -= BIG_BLOCK_SIZE;
57 			continue;
58 		}
59 		tTextBlock.ulFileOffset =
60 			(ulIndex + 1) * BIG_BLOCK_SIZE + ulOffset;
61 		tTextBlock.ulCharPos = ulCharPos;
62 		tTextBlock.ulLength = min(BIG_BLOCK_SIZE - ulOffset,
63 						(ULONG)lToGo);
64 		tTextBlock.bUsesUnicode = bUsesUnicode;
65 		tTextBlock.usPropMod = usPropMod;
66 		ulOffset = 0;
67 		if (!bAdd2TextBlockList(&tTextBlock)) {
68 			DBG_HEX(tTextBlock.ulFileOffset);
69 			DBG_HEX(tTextBlock.ulCharPos);
70 			DBG_DEC(tTextBlock.ulLength);
71 			DBG_DEC(tTextBlock.bUsesUnicode);
72 			DBG_DEC(tTextBlock.usPropMod);
73 			return FALSE;
74 		}
75 		ulCharPos += tTextBlock.ulLength;
76 		lToGo -= (long)tTextBlock.ulLength;
77 	}
78 	DBG_DEC_C(lToGo != 0, lToGo);
79 	return lToGo == 0;
80 } /* end of bAddTextBlocks */
81 
82 /*
83  * bGet6DocumentText - make a list of the text blocks of Word 6/7 files
84  *
85  * Code for "fast saved" files.
86  *
87  * Returns TRUE when successful, FALSE if not
88  */
89 BOOL
bGet6DocumentText(FILE * pFile,BOOL bUsesUnicode,ULONG ulStartBlock,const ULONG * aulBBD,size_t tBBDLen,const UCHAR * aucHeader)90 bGet6DocumentText(FILE *pFile, BOOL bUsesUnicode, ULONG ulStartBlock,
91 	const ULONG *aulBBD, size_t tBBDLen, const UCHAR *aucHeader)
92 {
93 	UCHAR	*aucBuffer;
94 	ULONG	ulBeginTextInfo, ulTextOffset, ulTotLength;
95 	size_t	tTextInfoLen;
96 	int	iIndex, iType, iOff, iLen, iPieces;
97 	USHORT	usPropMod;
98 
99 	DBG_MSG("bGet6DocumentText");
100 
101 	fail(pFile == NULL);
102 	fail(aulBBD == NULL);
103 	fail(aucHeader == NULL);
104 
105 	ulBeginTextInfo = ulGetLong(0x160, aucHeader);	/* fcClx */
106 	DBG_HEX(ulBeginTextInfo);
107 	tTextInfoLen = (size_t)ulGetLong(0x164, aucHeader);	/* lcbClx */
108 	DBG_DEC(tTextInfoLen);
109 
110 	aucBuffer = xmalloc(tTextInfoLen);
111 	if (!bReadBuffer(pFile, ulStartBlock,
112 			aulBBD, tBBDLen, BIG_BLOCK_SIZE,
113 			aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
114 		aucBuffer = xfree(aucBuffer);
115 		return FALSE;
116 	}
117 	NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
118 
119 	iOff = 0;
120 	while ((size_t)iOff < tTextInfoLen) {
121 		iType = (int)ucGetByte(iOff, aucBuffer);
122 		iOff++;
123 		if (iType == 0) {
124 			DBG_FIXME();
125 			iOff++;
126 			continue;
127 		}
128 		if (iType == 1) {
129 			iLen = (int)usGetWord(iOff, aucBuffer);
130 			vAdd2PropModList(aucBuffer + iOff);
131 			iOff += iLen + 2;
132 			continue;
133 		}
134 		if (iType != 2) {
135 			werr(0, "Unknown type of 'fastsaved' format");
136 			aucBuffer = xfree(aucBuffer);
137 			return FALSE;
138 		}
139 		/* Type 2 */
140 		iLen = (int)usGetWord(iOff, aucBuffer);
141 		NO_DBG_DEC(iLen);
142 		iOff += 4;
143 		iPieces = (iLen - 4) / 12;
144 		DBG_DEC(iPieces);
145 		for (iIndex = 0; iIndex < iPieces; iIndex++) {
146 			ulTextOffset = ulGetLong(
147 				iOff + (iPieces + 1) * 4 + iIndex * 8 + 2,
148 				aucBuffer);
149 			usPropMod = usGetWord(
150 				iOff + (iPieces + 1) * 4 + iIndex * 8 + 6,
151 				aucBuffer);
152 			ulTotLength = ulGetLong(iOff + (iIndex + 1) * 4,
153 						aucBuffer) -
154 					ulGetLong(iOff + iIndex * 4,
155 						aucBuffer);
156 			NO_DBG_HEX_C(usPropMod != 0, usPropMod);
157 			if (!bAddTextBlocks(ulTextOffset, ulTotLength,
158 					bUsesUnicode, usPropMod,
159 					ulStartBlock,
160 					aulBBD, tBBDLen)) {
161 				aucBuffer = xfree(aucBuffer);
162 				return FALSE;
163 			}
164 		}
165 		break;
166 	}
167 	aucBuffer = xfree(aucBuffer);
168 	return TRUE;
169 } /* end of bGet6DocumentText */
170 
171 /*
172  * bGet8DocumentText - make a list of the text blocks of Word 8/97 files
173  *
174  * Returns TRUE when successful, FALSE if not
175  */
176 BOOL
bGet8DocumentText(FILE * pFile,const pps_info_type * pPPS,const ULONG * aulBBD,size_t tBBDLen,const ULONG * aulSBD,size_t tSBDLen,const UCHAR * aucHeader)177 bGet8DocumentText(FILE *pFile, const pps_info_type *pPPS,
178 	const ULONG *aulBBD, size_t tBBDLen,
179 	const ULONG *aulSBD, size_t tSBDLen,
180 	const UCHAR *aucHeader)
181 {
182 	const ULONG	*aulBlockDepot;
183 	UCHAR	*aucBuffer;
184 	ULONG	ulTextOffset, ulBeginTextInfo;
185 	ULONG	ulTotLength, ulLen;
186 	long	lIndex, lPieces, lOff;
187 	size_t	tTextInfoLen, tBlockDepotLen, tBlockSize;
188 	int	iType, iLen;
189 	BOOL	bUsesUnicode;
190 	USHORT	usPropMod;
191 
192 	DBG_MSG("bGet8DocumentText");
193 
194 	fail(pFile == NULL || pPPS == NULL);
195 	fail(aulBBD == NULL || aulSBD == NULL);
196 	fail(aucHeader == NULL);
197 
198   	ulBeginTextInfo = ulGetLong(0x1a2, aucHeader);	/* fcClx */
199 	DBG_HEX(ulBeginTextInfo);
200 	tTextInfoLen = (size_t)ulGetLong(0x1a6, aucHeader);	/* lcbClx */
201 	DBG_DEC(tTextInfoLen);
202 
203 	DBG_DEC(pPPS->tTable.ulSB);
204 	DBG_HEX(pPPS->tTable.ulSize);
205 	if (pPPS->tTable.ulSize == 0) {
206 		return FALSE;
207 	}
208 
209 	if (pPPS->tTable.ulSize < MIN_SIZE_FOR_BBD_USE) {
210 	  	/* Use the Small Block Depot */
211 		aulBlockDepot = aulSBD;
212 		tBlockDepotLen = tSBDLen;
213 		tBlockSize = SMALL_BLOCK_SIZE;
214 	} else {
215 	  	/* Use the Big Block Depot */
216 		aulBlockDepot = aulBBD;
217 		tBlockDepotLen = tBBDLen;
218 		tBlockSize = BIG_BLOCK_SIZE;
219 	}
220 	aucBuffer = xmalloc(tTextInfoLen);
221 	if (!bReadBuffer(pFile, pPPS->tTable.ulSB,
222 			aulBlockDepot, tBlockDepotLen, tBlockSize,
223 			aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
224 		aucBuffer = xfree(aucBuffer);
225 		return FALSE;
226 	}
227 	NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
228 
229 	lOff = 0;
230 	while (lOff < (long)tTextInfoLen) {
231 		iType = (int)ucGetByte(lOff, aucBuffer);
232 		lOff++;
233 		if (iType == 0) {
234 			DBG_FIXME();
235 			lOff++;
236 			continue;
237 		}
238 		if (iType == 1) {
239 			iLen = (int)usGetWord(lOff, aucBuffer);
240 			vAdd2PropModList(aucBuffer + lOff);
241 			lOff += (long)iLen + 2;
242 			continue;
243 		}
244 		if (iType != 2) {
245 			werr(0, "Unknown type of 'fastsaved' format");
246 			aucBuffer = xfree(aucBuffer);
247 			return FALSE;
248 		}
249 		/* Type 2 */
250 		ulLen = ulGetLong(lOff, aucBuffer);
251 		if (ulLen < 4) {
252 			DBG_DEC(ulLen);
253 			return FALSE;
254 		}
255 		lOff += 4;
256 		lPieces = (long)((ulLen - 4) / 12);
257 		DBG_DEC(lPieces);
258 		for (lIndex = 0; lIndex < lPieces; lIndex++) {
259 			ulTextOffset = ulGetLong(
260 				lOff + (lPieces + 1) * 4 + lIndex * 8 + 2,
261 				aucBuffer);
262 			usPropMod = usGetWord(
263 				lOff + (lPieces + 1) * 4 + lIndex * 8 + 6,
264 				aucBuffer);
265 			ulTotLength = ulGetLong(lOff + (lIndex + 1) * 4,
266 						aucBuffer) -
267 					ulGetLong(lOff + lIndex * 4,
268 						aucBuffer);
269 			if ((ulTextOffset & BIT(30)) == 0) {
270 				bUsesUnicode = TRUE;
271 			} else {
272 				bUsesUnicode = FALSE;
273 				ulTextOffset &= ~BIT(30);
274 				ulTextOffset /= 2;
275 			}
276 			NO_DBG_HEX_C(usPropMod != 0, usPropMod);
277 			if (!bAddTextBlocks(ulTextOffset, ulTotLength,
278 					bUsesUnicode, usPropMod,
279 					pPPS->tWordDocument.ulSB,
280 					aulBBD, tBBDLen)) {
281 				aucBuffer = xfree(aucBuffer);
282 				return FALSE;
283 			}
284 		}
285 		break;
286 	}
287 	aucBuffer = xfree(aucBuffer);
288 	return TRUE;
289 } /* end of bGet8DocumentText */
290