summaryrefslogtreecommitdiff
path: root/sal/inc/rtl/uri.h
blob: e1743393c2af6ae74076c75a298d5a5f16edb72f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
/*************************************************************************
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 * 
 * Copyright 2008 by Sun Microsystems, Inc.
 *
 * OpenOffice.org - a multi-platform office productivity suite
 *
 * $RCSfile: uri.h,v $
 * $Revision: 1.12 $
 *
 * This file is part of OpenOffice.org.
 *
 * OpenOffice.org is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3
 * only, as published by the Free Software Foundation.
 *
 * OpenOffice.org is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License version 3 for more details
 * (a copy is included in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU Lesser General Public License
 * version 3 along with OpenOffice.org.  If not, see
 * <http://www.openoffice.org/license.html>
 * for a copy of the LGPLv3 License.
 *
 ************************************************************************/

#ifndef _RTL_URI_H_
#define _RTL_URI_H_

#include "rtl/textenc.h"
#include "rtl/ustring.h"
#include "sal/types.h"

#if defined __cplusplus
extern "C" {
#endif /* __cplusplus */

/**  Various predefined URI 'char classes.'

     @descr
     A 'char class' defines which (ASCII) characters can be written 'as they
     are' in a part of a Uri, and which characters have to be written using
     escape sequences ('%' followed by two hex digits).  Characters outside
     the ASCII range are always written using escape sequences.

     @descr
     If there are other frequently used char classes, they can be added to
     this enumeration; the function rtl_getUriCharClass() has to be adapted
     then, too.
 */
typedef enum
{
    /** The empty char class.

        @descr
        All characters are written using escape sequences.
     */
    rtl_UriCharClassNone,

    /** The RFC 2732 <uric> char class.

        @descr
        The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and
        letters.
     */
    rtl_UriCharClassUric,

    /** The RFC 2396 <uric_no_slash> char class.

        @descr
        The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters.
     */
    rtl_UriCharClassUricNoSlash,

    /** The RFC 2396 <rel_segment> char class.

        @descr
        The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters.
     */
    rtl_UriCharClassRelSegment,

    /** The RFC 2396 <reg_name> char class.

        @descr
        The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters.
     */
    rtl_UriCharClassRegName,

    /** The RFC 2396 <userinfo> char class.

        @descr
        The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters.
     */
    rtl_UriCharClassUserinfo,

    /** The RFC 2396 <pchar> char class.

        @descr
        The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters.
     */
    rtl_UriCharClassPchar,

    /** The char class for the values of uno URL parameters.

        @descr
        The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters.
     */
    rtl_UriCharClassUnoParamValue,

    rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
}
rtl_UriCharClass;

/** The mechanism describing how escape sequences in the input of
    rtl_uriEncode() are handled.
 */
typedef enum
{
    /** The special meaning of '%' is ignored (i.e., there are by definition
        no escape sequences in the input).

        @descr
        This mechanism is useful to encode user input as part of a URI (e.g.,
        the user-supplied password in an ftp URL---'%20abcde' is a valid
        password, so do not assume that the '%20' is an escaped space).
     */
    rtl_UriEncodeIgnoreEscapes,

    /** All escape sequences ('%' followed by two hex digits) are kept intact,
        even if they represent characters that need not be escaped or if they
        do not even map to characters in the given charset.

        @descr
        This mechanism is useful when passing on complete URIs more or less
        unmodified (e.g., within an HTTP proxy): missing escape sequences are
        added, but existing escape sequences are not touched (except that any
        lower case hex digits are replaced by upper case hex digits).
     */
    rtl_UriEncodeKeepEscapes,

    /** All escape sequences ('%' followed by two hex digits) are resolved in
        a first step; only those that represent characters that need to be
        escaped are kept intact.

        @descr
        This mechanism is useful to properly encode complete URIs entered by
        the user: the URI is brought into a 'canonic form,' but care is taken
        not to damage (valid) escape sequences the (careful) user already
        entered as such.
     */
    rtl_UriEncodeCheckEscapes,

    /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting
        unmappable characters.

        @since UDK 3.2.0
     */
    rtl_UriEncodeStrict,

    /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting
        unmappable characters.

        @since UDK 3.2.7
     */
    rtl_UriEncodeStrictKeepEscapes,

    rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
}
rtl_UriEncodeMechanism;

/** The mechanism describing how rtl_uriDecode() translates (part of) a URI
    into a Unicode string.
 */
typedef enum
{
    /** The text is returned completely unmodified.
     */
    rtl_UriDecodeNone,

    /** The text is returned in the form of an IURI (cf.
        draft-masinter-url-i18n-05.txt).

        @descr
        All escape sequences representing ASCII characters (%00--%7F) are
        kept, all other escape sequences are interpreted as UTF-8 characters
        and translated to Unicode, if possible.
     */
    rtl_UriDecodeToIuri,

    /** The text is decoded.

        @descr
        All escape sequences representing characters from the given charset
        are decoded and translated to Unicode, if possible.
     */
    rtl_UriDecodeWithCharset,

    /** Like rtl_UriDecodeWithCharset, but indicating failure when converting
        unmappable characters.

        @since UDK 3.2.0
     */
    rtl_UriDecodeStrict,

    rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
}
rtl_UriDecodeMechanism;

/** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode().

    @descr
    The function rtl_uriEncode() expects an array of 128 booleans, and this
    function maps rtl_UriCharClass enumeration members to such arrays.

    @param eCharClass
    Any valid member of rtl_UriCharClass.

    @return
    An array of 128 booleans, to be used in calls to rtl_uriEncode().
 */
sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
    SAL_THROW_EXTERN_C();

/** Encode a text as (part of) a URI.

    @param pText
    Any Unicode string.  Must not be null.

    @param pCharClass
    A char class, represented as an array of 128 booleans (true means keep the
    corresponding ASCII character unencoded, false means encode it).  Must not
    be null, and the boolean corresponding to the percent sign (0x25) must be
    false.  (See rtl_getUriCharClass() for a function mapping from
    rtl_UriCharClass to such arrays.)

    @param eMechanism
    The mechanism describing how escape sequences in the input text are
    handled.

    @param eCharset
    When Unicode characters from the input text have to be written using
    escape sequences (because they are either outside the ASCII range or do
    not belong to the given char class), they are first translated into this
    charset before being encoded using escape sequences.

    Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape
    sequences already present in the input text are interpreted as characters
    from this charset.

    @param pResult
    Returns an encoded representation of the input text.  Must itself not be
    null, and must point to either null or a valid string.

    If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be
    converted to eCharset because it contains unmappable characters (which
    implies that pText is not empty), then an empty string is returned.
 */
void SAL_CALL rtl_uriEncode(rtl_uString * pText,
                            sal_Bool const * pCharClass,
                            rtl_UriEncodeMechanism eMechanism,
                            rtl_TextEncoding eCharset,
                            rtl_uString ** pResult)
    SAL_THROW_EXTERN_C();

/** Decode (a part of) a URI.

    @param pText
    Any Unicode string.  Must not be null.  (If the input is indeed part of a
    valid URI, this string will only contain a subset of the ASCII characters,
    but this function also handles other Unicode characters properly.)

    @param eMechanism
    The mechanism describing how the input text is translated into a Unicode
    string.

    @param eCharset
    When the decode mechanism is rtl_UriDecodeWithCharset, all escape
    sequences in the input text are interpreted as characters from this
    charset.  Those characters are translated to Unicode characters in the
    resulting output, if possible.

    When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri,
    this parameter is ignored (and is best specified as
    RTL_TEXTENCODING_UTF8).

    @param pResult
    Returns a decoded representation of the input text.  Must itself not be
    null, and must point to either null or a valid string.

    If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be
    converted to eCharset because it contains (encodings of) unmappable
    characters (which implies that pText is not empty), then an empty string is
    returned.
 */
void SAL_CALL rtl_uriDecode(rtl_uString * pText,
                            rtl_UriDecodeMechanism eMechanism,
                            rtl_TextEncoding eCharset,
                            rtl_uString ** pResult)
    SAL_THROW_EXTERN_C();

/** Convert a relative URI reference into an absolute one.

    A URI reference is a URI plus an optional <"#" fragment> part.

    This function uses the algorithm described in RFC 2396, section 5.2, with
    the following clarifications:  (1) Backwards-compatible relative URIs
    starting with a scheme component (see RFC 2396, section 5.2, step 3) are not
    supported.  (2) Segments "." and ".." within the path of the base URI are
    not considered special, RFC 2396 seems a bit unlcear about that point.
    (3) Erroneous excess segments ".." within the path of the relative URI (if
    it is indeed relative) are left intact, as the examples in RFC 2396,
    section C.2, suggest.  (4) If the relative URI is a reference to the
    "current document," the "current document" is taken to be the base URI.

    This function signals exceptions by returning false and letting pException
    point to a message explaining the exception.

    @param pBaseUriRef
    An absolute, hierarchical URI reference that serves as the base URI.  If it
    has to be inspected (i.e., pRelUriRef is not an absolute URI already), and
    if it either is not an absolute URI (i.e., does not begin with a
    <scheme ":"> part) or has a path that is non-empty but does not start
    with "/", an exception will be signaled.

    @param pRelUriRef
    An URI reference that may be either absolute or relative.  If it is
    absolute, it will be returned unmodified (and it need not be hierarchical
    then).

    @param pResult
    Returns an absolute URI reference.  Must itself not be null, and must point
    to either null or a valid string.  If an exception is signalled, it is left
    unchanged.

    @param pException
    Returns an explanatory message in case an exception is signalled.  Must
    itself not be null, and must point to either null or a valid string.  If no
    exception is signalled, it is left unchanged.

    @return
    True if no exception is signalled, otherwise false.
 */
sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
                                         rtl_uString * pRelUriRef,
                                         rtl_uString ** pResult,
                                         rtl_uString ** pException)
    SAL_THROW_EXTERN_C();

#if defined __cplusplus
}
#endif /* __cplusplus */

#endif /* _RTL_URI_H_ */