1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
|
#!/usr/bin/env python
# coding=UTF-8
import ctypes
import sys
# Load the FriBidi shared library.  The plain "libfribidi.so" name is tried
# first; if the loader cannot open it (OSError), fall back to whatever
# ctypes.util.find_library() reports for "fribidi".  When that lookup finds
# nothing either, the original OSError is re-raised unchanged.
try:
    libfribidi = ctypes.CDLL("libfribidi.so")
except OSError:
    import ctypes.util
    _fribidi_path = ctypes.util.find_library("fribidi")
    if _fribidi_path is None:
        raise
    libfribidi = ctypes.CDLL(_fribidi_path)
# Character Types
class types:
    """
    Bit masks and composite values for bidirectional character types.

    The first half declares single-bit masks; the second half combines
    them into one value per character type.
    """

    # --- Direction / script masks ---
    MASK_RTL = 0x00000001      # right to left
    MASK_ARABIC = 0x00000002   # Arabic

    # --- Strength masks: a character carries exactly one of these three ---
    MASK_STRONG = 0x00000010   # strong
    MASK_WEAK = 0x00000020     # weak
    MASK_NEUTRAL = 0x00000040  # neutral

    # --- Category masks: a character carries exactly one of these five ---
    MASK_LETTER = 0x00000100     # letter: L, R, AL
    MASK_NUMBER = 0x00000200     # number: EN, AN
    MASK_NUMSEPTER = 0x00000400  # number separator or terminator: ES, ET, CS
    MASK_SPACE = 0x00000800      # space: BN, BS, SS, WS
    MASK_EXPLICIT = 0x00001000   # explicit mark: LRE, RLE, LRO, RLO, PDF

    # Only meaningful together with MASK_SPACE.
    MASK_SEPARATOR = 0x00002000  # text separator: BS, SS

    # Only meaningful together with MASK_EXPLICIT.
    MASK_OVERRIDE = 0x00004000   # explicit override: LRO, RLO

    # --- Discriminator masks ---
    # These keep the composite types below pairwise distinct.  Some are
    # redundant but are kept so that type queries stay cheap.
    MASK_ES = 0x00010000
    MASK_ET = 0x00020000
    MASK_CS = 0x00040000
    MASK_NSM = 0x00080000
    MASK_BN = 0x00100000
    MASK_BS = 0x00200000
    MASK_SS = 0x00400000
    MASK_WS = 0x00800000

    # --- Character type values (FriBidiCharType) ---
    # Strong types
    LTR = MASK_STRONG | MASK_LETTER                            # strong left to right
    RTL = MASK_STRONG | MASK_LETTER | MASK_RTL                 # right to left characters
    AL = MASK_STRONG | MASK_LETTER | MASK_RTL | MASK_ARABIC    # Arabic characters
    LRE = MASK_STRONG | MASK_EXPLICIT                          # left-to-right embedding
    RLE = MASK_STRONG | MASK_EXPLICIT | MASK_RTL               # right-to-left embedding
    LRO = MASK_STRONG | MASK_EXPLICIT | MASK_OVERRIDE          # left-to-right override
    RLO = MASK_STRONG | MASK_EXPLICIT | MASK_RTL | MASK_OVERRIDE  # right-to-left override

    # Weak types
    PDF = MASK_WEAK | MASK_EXPLICIT               # pop directional override
    EN = MASK_WEAK | MASK_NUMBER                  # European digit
    AN = MASK_WEAK | MASK_NUMBER | MASK_ARABIC    # Arabic digit
    ES = MASK_WEAK | MASK_NUMSEPTER | MASK_ES     # European number separator
    ET = MASK_WEAK | MASK_NUMSEPTER | MASK_ET     # European number terminator
    CS = MASK_WEAK | MASK_NUMSEPTER | MASK_CS     # common separator
    NSM = MASK_WEAK | MASK_NSM                    # non-spacing mark
    BN = MASK_WEAK | MASK_SPACE | MASK_BN         # boundary neutral

    # Neutral types
    BS = MASK_NEUTRAL | MASK_SPACE | MASK_SEPARATOR | MASK_BS  # block separator
    SS = MASK_NEUTRAL | MASK_SPACE | MASK_SEPARATOR | MASK_SS  # segment separator
    WS = MASK_NEUTRAL | MASK_SPACE | MASK_WS                   # whitespace
    ON = MASK_NEUTRAL                                          # other neutral
# Memory allocation functions
def _malloc_int_array(l):
    """
    Creates and returns a ctypes array holding `l' C ints
    """
    return (ctypes.c_int * l)()
def _malloc_int8_array(l):
    """
    Creates and returns a ctypes array holding `l' C int8 values
    """
    return (ctypes.c_int8 * l)()
def _malloc_utf8_array(l):
    """
    Creates and returns a ctypes C char array of `l' bytes, used as a UTF8 buffer
    """
    return (ctypes.c_char * l)()
def _malloc_utf8_array_from_string(s):
    """
    Wraps the byte string `s' in a ctypes c_char_p and returns that object
    """
    char_ptr = ctypes.c_char_p(s)
    return char_ptr
def _malloc_utc32_array(l):
    """
    Creates and returns a ctypes array of `l' C uint32 values (one per UTC32 unit)
    """
    return (ctypes.c_uint32 * l)()
# Unicode type convertors
def _pyunicode_to_utc32_p(a_pyunicode):
    """
    Converts a Python Unicode instance to a ctypes uint32 (UTC32) array

    The text is encoded as UTF-8 and handed to `fribidi_utf8_to_unicode',
    which fills a newly created array that has one slot more than
    `len(a_pyunicode)'.  That array is returned.
    """
    char_count = len(a_pyunicode)
    encoded = a_pyunicode.encode('utf-8')
    byte_count = len(encoded)
    source_p = _malloc_utf8_array_from_string(encoded)
    # One slot beyond the character count.
    target_p = _malloc_utc32_array(char_count + 1)
    libfribidi.fribidi_utf8_to_unicode(source_p, byte_count, target_p)
    return target_p
def _utc32_p_to_pyunicode(a_utc32_p):
    """
    Converts a ctypes uint32 (UTC32) array to UTF-8 text via FriBidi

    The element count is derived from the byte size of `a_utc32_p', so the
    argument is expected to be a ctypes array rather than a bare pointer.

    Returns the `value' of the filled C char buffer, i.e. the bytes up to
    the first NUL.
    NOTE(review): despite the function name, this is the UTF-8 encoded byte
    string, not a decoded Unicode object; the return type is kept as-is so
    existing callers are unaffected.
    """
    # Floor division keeps the count an integer even under true division
    # (Python 3, or `from __future__ import division'); a float here would
    # make the array construction below raise TypeError.
    utc32_len = ctypes.sizeof(a_utc32_p) // ctypes.sizeof(ctypes.c_uint32)
    # Output budget: six bytes per input element plus one extra byte.
    utf8_len = 6 * utc32_len + 1
    utf8_p = _malloc_utf8_array(utf8_len)
    libfribidi.fribidi_unicode_to_utf8(a_utc32_p, utc32_len, utf8_p)
    return utf8_p.value
# FriBidi API
def log2vis(input_pyunicode, input_pbase_dir, with_l2v_position=False, with_v2l_position=False, with_embedding_level=False):
    """
    Runs `fribidi_log2vis' on a Python Unicode string

    input_pyunicode      -- the text to process.
    input_pbase_dir      -- integer base direction; wrapped in a C int32 and
                            passed to the library by pointer.
    with_l2v_position    -- request the l2v position array.
    with_v2l_position    -- request the v2l position array.
    with_embedding_level -- request the embedding level array.

    Returns the converted output string alone when no optional array was
    requested.  Otherwise returns a list whose first item is that string,
    followed by one Python list per requested array, in the order
    l2v, v2l, embedding levels.

    Raises Exception when the library call returns a false value.
    """
    char_count = len(input_pyunicode)

    # Buffers handed to the C function.
    logical_p = _pyunicode_to_utc32_p(input_pyunicode)
    base_dir = ctypes.c_int32(input_pbase_dir)
    visual_p = _malloc_utc32_array(char_count + 1)

    # Optional outputs stay None (a NULL pointer for ctypes) unless requested.
    l2v_p = None
    v2l_p = None
    emb_p = None
    if with_l2v_position:
        l2v_p = _malloc_int_array(char_count)
    if with_v2l_position:
        v2l_p = _malloc_int_array(char_count)
    if with_embedding_level:
        emb_p = _malloc_int8_array(char_count)

    status = libfribidi.fribidi_log2vis(
        logical_p,
        char_count,
        ctypes.pointer(base_dir),
        visual_p,
        l2v_p,
        v2l_p,
        emb_p
    )
    if not status:
        raise Exception('fribidi_log2vis failed')

    visual = _utc32_p_to_pyunicode(visual_p)

    # Copy every requested C array into a plain Python list.
    extras = [list(c_array) for c_array in (l2v_p, v2l_p, emb_p) if c_array is not None]
    if not extras:
        return visual
    return [visual] + extras
# Main
def _test():
    """
    Prints the log2vis result, with all three optional outputs requested,
    for a few sample strings, each with base direction LTR and then RTL
    """
    samples = (u"سلام", u"1سلام", u"aسلام")
    for text in samples:
        for base_dir in (types.LTR, types.RTL):
            # Single-argument parenthesised print behaves the same as the
            # print statement on Python 2 and is also valid on Python 3.
            print(log2vis(text, base_dir, True, True, True))
# Run the sample conversions only when executed as a script.
if __name__ == '__main__':
    _test()
|