summaryrefslogtreecommitdiff
path: root/XMPFiles/source/FormatSupport/XMPScanner.hpp
blob: cfdaaa3894d2333037585dd0b3373992fc0e55e3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#ifndef __XMPScanner_hpp__
#define __XMPScanner_hpp__

// =================================================================================================
// Copyright 2004 Adobe
// All Rights Reserved.
//
// NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
// of the Adobe license agreement accompanying it. If you have received this file from a source other 
// than Adobe, then your use, modification, or distribution of it requires the prior written permission
// of Adobe.
//
// Adobe patent application tracking #P435, entitled 'Unique markers to simplify embedding data of
// one format in a file with a different format', inventors: Sean Parent, Greg Gilley.
// =================================================================================================

#include "public/include/XMP_Environment.h"	// ! This must be the first include.

#include <list>
#include <vector>
#include <string>
#include <memory>
#include <stdexcept>

#include "public/include/XMP_Const.h"

// =================================================================================================
// The XMPScanner class is used to scan a stream of input for XMP packets.  A scanner object is
// constructed then fed the input through a series of calls to Scan.  Report may be called at any
// time to get the current knowledge of the input.
//
// A packet starts when a valid header is found and ends when a valid trailer is found.  If the
// header contains a "bytes" attribute, additional whitespace must follow.
//
// *** RESTRICTIONS: The current implementation of the scanner has the following restrictions:
//		- The input must be presented in order.
//		- Not fully thread safe, don't make concurrent calls to the same XMPScanner object.
// =================================================================================================

class XMPScanner {
public:

	// =============================================================================================
	// The entire input stream is represented as a series of snips.  Each snip defines one portion
	// of the input stream that either has not been seen, has been seen and contains no packets, is
	// exactly one packet, or contains the start of an unfinished packet.  Adjacent snips with the
	// same state are merged, so the number of snips is always minimal.
	//
	// A newly constructed XMPScanner object has one snip covering the whole input with a state
	// of "not seen".  A block of input that contains a full XMP packet is split into 3 parts: a
	// (possibly empty) raw input snip, the packet, and another (possibly empty) raw input snip.  A
	// block of input that contains the start of an XMP packet is split into two snips, a (possibly
	// empty) raw input snip and the packet start; the following snip must be a "not seen" snip.
	//
	// It is possible to have ill-formed packets.  These have a syntactically valid header and
	// trailer, but some semantic error.  For example, if the "bytes" attribute length does not span
	// to the end of the trailer, or if the following packet begins within trailing padding.
	
	// The possible states of a snip.  The enum is anonymous; values are stored in the XMP_Uns8
	// based SnipState type declared right after it.
	enum {
		eNotSeenSnip,		// This snip has not been seen yet.
		ePendingSnip,		// This snip is an input buffer being processed.
		eRawInputSnip,		// This snip is raw input, it doesn't contain any part of an XMP packet.
		eValidPacketSnip,	// This snip is a complete, valid XMP packet.
		ePartialPacketSnip,	// This snip contains the start of a possible XMP packet.
		eBadPacketSnip		// This snip contains a complete, but semantically incorrect XMP packet.
	};
	typedef XMP_Uns8	SnipState;
	
	// The character encodings a packet can use.  Bit 0 (value 1) is set for little endian forms,
	// bit 1 (value 2) for 16 bit forms, and bit 2 (value 4) for 32 bit forms; see the masks below.
	enum {	// The values allow easy testing for 16/32 bit and big/little endian.
		eChar8Bit			= 0,
		eChar16BitBig		= 2,
		eChar16BitLittle	= 3,
		eChar32BitBig		= 4,
		eChar32BitLittle	= 5
	};
	typedef XMP_Uns8	CharacterForm;

	enum {
		eChar16BitMask			= 2,	// These constants shouldn't be used directly, they are mainly
		eChar32BitMask			= 4,	// for the CharFormIsXyz macros below.
		eCharLittleEndianMask	= 1
	};
	
	// Predicates on a CharacterForm value.  Each one casts its argument to int and tests a single
	// mask bit.
	// NOTE(review): these are preprocessor macros, so they are not scoped to XMPScanner even
	// though they are defined inside the class body; they are visible to every file that includes
	// this header.
	#define CharFormIs16Bit(f)			( ((int)(f) & XMPScanner::eChar16BitMask) != 0 )
	#define CharFormIs32Bit(f)			( ((int)(f) & XMPScanner::eChar32BitMask) != 0 )
	
	// eChar8Bit has no endian bit set, so CharFormIsBigEndian is also true for it.
	#define CharFormIsBigEndian(f)		( ((int)(f) & XMPScanner::eCharLittleEndianMask) == 0 )
	#define CharFormIsLittleEndian(f)	( ((int)(f) & XMPScanner::eCharLittleEndianMask) != 0 )
	
	// The public description of one snip, as returned by Report.
	struct SnipInfo {

		XMP_Int64		fOffset;		// The byte offset of this snip within the input stream.
		XMP_Int64		fLength;		// The length in bytes of this snip.
		SnipState		fState;			// The state of this snip.
		bool			fOutOfOrder;	// If true, this snip was seen before the one in front of it.
		char			fAccess;		// The read-only/read-write access from the end attribute.
		CharacterForm	fCharForm;		// How the packet is divided into characters.
		// NOTE(review): fEncodingAttr is a raw pointer and this header does not show who owns the
		// string it points to or how long it stays valid - confirm against the implementation.
		const char *	fEncodingAttr;	// The value of the encoding attribute, if any, with nulls removed.
		XMP_Int64		fBytesAttr;		// The value of the bytes attribute, -1 if not present.

		// Constructs an empty "not seen" snip: offset and length are 0, access is a blank, the
		// character form is 8 bit, the encoding is an empty string, and there is no bytes value.
		SnipInfo() :
			fOffset ( 0 ),
			fLength ( 0 ),
			fState ( eNotSeenSnip ),
			fOutOfOrder ( false ),
			fAccess ( ' ' ),
			fCharForm ( eChar8Bit ),
			fEncodingAttr ( "" ),
			fBytesAttr( -1 )
		{ }

		// Constructs a snip with the given state, offset, and length.  All other fields get the
		// same defaults as in the default constructor.
		SnipInfo ( SnipState state, XMP_Int64 offset, XMP_Int64 length ) :
			fOffset ( offset ),
			fLength ( length ),
			fState ( state ),
			fOutOfOrder ( false ),
			fAccess ( ' ' ),
			fCharForm ( eChar8Bit ),
			fEncodingAttr ( "" ),
			fBytesAttr( -1 )
		{ }
	
	};
	
	typedef std::vector<SnipInfo>	SnipInfoVector;

	XMPScanner ( XMP_Int64 streamLength );
	// Constructs a new XMPScanner object for a stream with the given length.
	
	~XMPScanner();

	long GetSnipCount();
	// Returns the number of snips that the stream has been divided into.
	
 	bool StreamAllScanned();
 	// Returns true if all of the stream has been seen.
	
	void Scan ( const void * bufferOrigin, XMP_Int64 bufferOffset, XMP_Int64 bufferLength );
	// Scans the given part of the input, incorporating it in to the known snips.
	// The bufferOrigin is the address of the first byte of this block of input.
	// The bufferOffset is the offset of this block of input relative to the entire stream.
	// The bufferLength is the length in bytes of this block of input.

	void Report ( SnipInfoVector & snips );
	// Produces a report of what is known about the input stream. 

	// The exception type declared for scanner errors, derived from std::logic_error.  The default
	// constructor uses an empty message.
	// NOTE(review): this header does not show which calls throw it - see the implementation.
	class ScanError : public std::logic_error {
	public:
		ScanError() throw() : std::logic_error ( "" ) {}
		explicit ScanError ( const char * message ) throw() : std::logic_error ( message ) {}
		virtual ~ScanError() throw() {}
	};

private:	// XMPScanner
	
	class PacketMachine;

	// The internal form of a snip: the public info plus an optional packet state machine.
	class InternalSnip {
	public:

		SnipInfo	fInfo;							// The public info about this snip.
		// NOTE(review): std::auto_ptr is deprecated since C++11 and removed in C++17.  Replacing
		// it needs matching changes in the out-of-line constructors, which are not visible here.
		std::auto_ptr<PacketMachine>	fMachine;	// The state machine for "active" snips.
		
		InternalSnip ( XMP_Int64 offset, XMP_Int64 length );
		InternalSnip ( const InternalSnip & );	// User-declared copy constructor, defined out of line.
		~InternalSnip ();

	};	// InternalSnip

	typedef std::list<InternalSnip>		InternalSnipList;
	typedef InternalSnipList::iterator	InternalSnipIterator;

	// The state machine that recognizes an XMP packet within the buffer associated with a snip.
	class PacketMachine {
	public:
		
		// NOTE(review): fPacketLength and fBytesAttr are 32 bit here, while the corresponding
		// SnipInfo fields (fLength, fBytesAttr) are 64 bit - confirm the narrower range is intended.
		XMP_Int64		fPacketStart;	// Byte offset relative to the entire stream.
		XMP_Int32		fPacketLength;	// Length in bytes to the end of the trailer processing instruction.
		XMP_Int32		fBytesAttr;		// The value of the bytes attribute, -1 if not present.
		std::string		fEncodingAttr;	// The value of the encoding attribute, if any, with nulls removed.
		CharacterForm	fCharForm;		// How the packet is divided into characters.
		char			fAccess;		// The read-only/read-write access from the end attribute.
		bool			fBogusPacket;	// True if the packet has an error such as a bad "bytes" attribute value.
		
		void ResetMachine();

		// A three-valued result, used by FindNextPacket and by all of the recognizer procs.
		enum TriState {
			eTriNo,
			eTriMaybe,
			eTriYes
		};

		// NOTE(review): presumably eTriYes means a complete packet was found and eTriMaybe means
		// more input is needed - confirm against the implementation.
		TriState FindNextPacket();
		
		void AssociateBuffer ( XMP_Int64 bufferOffset, const void * bufferOrigin, XMP_Int64 bufferLength );
		
		PacketMachine ( XMP_Int64 bufferOffset, const void * bufferOrigin, XMP_Int64 bufferLength );
		~PacketMachine();
	
	private:	// PacketMachine
	
		PacketMachine() {};	// ! Hide the default constructor.
	
		// The recognizer states, listed in the order the parts appear in a packet: lead-in, header
		// with its attributes, body, then trailer with its attributes and the end-of-packet checks.
		enum RecognizerKind {

			eFailureRecognizer,			// Not really recognizers, special states to end one buffer's processing.
			eSuccessRecognizer,

			eLeadInRecognizer,			// Anything up to the next '<'.
			eHeadStartRecorder,			// Save the starting offset, count intervening nulls.
			eHeadStartRecognizer,		// The literal string "?xpacket begin=".

			eBOMRecognizer,				// Recognize and record the quoted byte order marker.

			eIDTagRecognizer,			// The literal string " id=".
			eIDOpenRecognizer,			// The opening quote for the ID.
			eIDValueRecognizer,			// The literal string "W5M0MpCehiHzreSzNTczkc9d".
			eIDCloseRecognizer,			// The closing quote for the ID.

			eAttrSpaceRecognizer_1, 	// The space before an attribute.
			eAttrNameRecognizer_1,		// The name of an attribute.
			eAttrValueRecognizer_1,		// The equal sign and quoted string value for an attribute.
			eAttrValueRecorder_1,		// Record the value of an attribute.

			eHeadEndRecognizer,			// The string literal "?>".			
			
			eBodyRecognizer,			// The packet body, anything up to the next '<'.

			eTailStartRecognizer,		// The string literal "?xpacket end=".
			eAccessValueRecognizer,		// Recognize and record the quoted r/w access mode.

			eAttrSpaceRecognizer_2, 	// The space before an attribute.
			eAttrNameRecognizer_2,		// The name of an attribute.
			eAttrValueRecognizer_2,		// The equal sign and quoted string value for an attribute.
			eAttrValueRecorder_2,		// Record the value of an attribute.

			eTailEndRecognizer,			// The string literal "?>".
			ePacketEndRecognizer,		// Look for trailing padding, check and record the packet size.
			eCloseOutRecognizer,		// Look for final nulls for little endian multibyte characters.
			
			eRecognizerCount			// The number of recognizer kinds, not a recognizer itself.

		};
		
		XMP_Int64		fBufferOffset;	// The offset of the data buffer within the input stream.
		const char *	fBufferOrigin;	// The starting address of the data buffer for this snip.
		const char *	fBufferPtr;		// The current position in the data buffer.
		const char *	fBufferLimit;	// The address one past the last byte in the data buffer.

		RecognizerKind	fRecognizer;	// Which recognizer is currently active.
		signed long		fPosition;		// The internal position within a string literal, etc.
		unsigned char	fBytesPerChar;	// The number of bytes per logical character, 1, 2, or 4.
		unsigned char	fBufferOverrun;	// Non-zero if suspended while skipping intervening nulls.
		char			fQuoteChar;		// The kind of quote seen at the start of a quoted value.
		std::string		fAttrName;		// The name for an arbitrary attribute (other than "begin" and "id").
		std::string		fAttrValue;		// The value for an arbitrary attribute (other than "begin" and "id").
	
		void SetNextRecognizer ( RecognizerKind nextRecognizer );
				
		// The common signature of the recognizer procs below.  They are static members that take
		// the machine explicitly as "ths", plus one string argument that several of them ignore.
		typedef TriState (* RecognizerProc) ( PacketMachine *, const char * );
	
		static TriState
		FindLessThan ( PacketMachine * ths, const char * which );
	
		static TriState
		MatchString ( PacketMachine * ths, const char * literal );
	
		static TriState
		MatchChar ( PacketMachine * ths, const char * literal );
	
		static TriState
		MatchOpenQuote ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		MatchCloseQuote ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		CaptureAttrName ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		CaptureAttrValue ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		RecordStart ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		RecognizeBOM ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		RecordHeadAttr ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		CaptureAccess ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		RecordTailAttr ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		CheckPacketEnd ( PacketMachine * ths, const char * /* unused */ );
	
		static TriState
		CheckFinalNulls ( PacketMachine * ths, const char * /* unused */ );
		
		// One entry describing a recognizer: the proc to call and the recognizer to move to on
		// success or on failure.
		// NOTE(review): "literal" is presumably what gets passed as the proc's second argument -
		// confirm against the table in the implementation.
		struct RecognizerInfo {
			RecognizerProc	proc;
			RecognizerKind	successNext;
			RecognizerKind	failureNext;
			const char *	literal;
		};
		
	};	// PacketMachine
	
	XMP_Int64			fStreamLength;		// Set from the constructor's streamLength, presumably - confirm.
	InternalSnipList	fInternalSnips;		// The list of snips that the stream is divided into.

	// Helpers for maintaining the snip list.  They are defined out of line; only their
	// signatures are visible in this header.

	void
	SplitInternalSnip ( InternalSnipIterator snipPos, XMP_Int64 relOffset, XMP_Int64 newLength );

	InternalSnipIterator
	MergeInternalSnips ( InternalSnipIterator firstPos, InternalSnipIterator secondPos );

	InternalSnipIterator
	PrevSnip ( InternalSnipIterator snipPos );

	InternalSnipIterator
	NextSnip ( InternalSnipIterator snipPos );

	// Only declared in builds where DEBUG is defined to a non-zero value.
	#if DEBUG
		void DumpSnipList ( const char * title );
	#endif
	
};	// XMPScanner

#endif	// __XMPScanner_hpp__