;;;
;;; Copyright (c) 2009  M Joonas Pihlaja
;;;
;;; Permission is hereby granted, free of charge, to any person
;;; obtaining a copy of this software and associated documentation
;;; files (the "Software"), to deal in the Software without
;;; restriction, including without limitation the rights to use,
;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
;;; copies of the Software, and to permit persons to whom the
;;; Software is furnished to do so, subject to the following
;;; conditions:
;;;
;;; The above copyright notice and this permission notice shall be
;;; included in all copies or substantial portions of the Software.
;;;
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
;;; OTHER DEALINGS IN THE SOFTWARE.

;;;
;;; Unpremultiply routine for SSE2/AMD64.
;;;
;;; This file exports a function unpremultiply_with_sse2() that
;;; can be used to unpremultiply a contiguous buffer of 32 bit pixels.
;;;
;;; void unpremultiply_with_sse2(
;;;		uint32_t        *dst,
;;;		uint32_t const  *src,
;;;		unsigned long    num_pixels);
;;;
;;; Assembled with nasm 2.06rc2.
;;;
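;;; A minimal C caller sketch (the in-place wrapper below is
;;; hypothetical; calling with dst == src is supported and takes
;;; the aliased movdqa store path):
;;;
;;;   #include <stdint.h>
;;;
;;;   extern void unpremultiply_with_sse2(uint32_t *dst,
;;;                                       uint32_t const *src,
;;;                                       unsigned long num_pixels);
;;;
;;;   static void unpremultiply_in_place(uint32_t *pixels,
;;;                                      unsigned long num_pixels)
;;;   {
;;;       unpremultiply_with_sse2(pixels, pixels, num_pixels);
;;;   }
;;;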
	section .text

; We're only using rax-rbp in this file so that a port to
; 32 bit SSE2 would be easier: only the register names and
; the argument extraction for the 32 bit calling convention
; would need updating.

; Location of alpha in a 32 bit pixel.  Alpha measures opaqueness.
%define ASHIFT 24
;%define ASHIFT 0

;; Reciprocal table with 64 bit entries of 4 x 16 bit vectors
;; of the form
;;
;;  (1.0, 255/i, 255/i, 255/i)	 for ASHIFT=0
;;  (255/i, 255/i, 255/i, 1.0)	 for ASHIFT=24
;;
;; in 8.8 bit fixed point format.
	align	16
reciprocal_table_Q:
	dq	0
%assign i 1
%rep	255
%assign recip	255*256 / i
%if ASHIFT == 0
	dw	256, recip, recip, recip
%elif ASHIFT==24
	dw	recip, recip, recip, 256
%endif
%assign i i+1
%endrep
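
;; In C an entry for alpha a (with ASHIFT=24) would be built
;; roughly like this sketch; a = 0 maps to an all-zero entry:
;;
;;   uint16_t recip = (255 * 256) / a;                 /* 255/a in 8.8 */
;;   uint16_t entry[4] = { recip, recip, recip, 256 }; /* 256 == 1.0 */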

;; Reciprocal table with 32 bit entries of ceil(255/i) in
;; 16.16 bit fixed point.
reciprocal_table_D:
	dd	0
%assign i 1
%rep	255
%assign recip	(255*65536 + i-1) / i
	dd	recip
%assign i i+1
%endrep
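
;; Worked example of how this table is used below: for component
;; x = 100 at alpha a = 200 we want trunc(100*255/200) = 127.  The
;; entry is ceil(255*65536/200) = 83559 and (100*83559) >> 16 = 127.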

unpremultiply_single_pixels:
;; Slower version for the odd pixels at the ends.
;;
;; In:
;;   uint32_t *dst/rdi:		Destination pixels.
;;   uint32_t *src/rsi:		Source pixels.
;;   num_pixels/rcx:		# pixels to unpremultiply.
;;
;; Out:
;;   rdi:			dst + 4*num_pixels; advanced past dst.
;;   rsi:			src + 4*num_pixels; advanced past src.
;;
;; Saved: rdx
;; Scratched: rax-rcx, rbp
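;;
;; Per pixel this computes roughly the following (C model for
;; ASHIFT=24; the masking below differs in detail but not in the
;; result for well-formed premultiplied input):
;;
;;   uint32_t p = src[i];
;;   uint32_t r = reciprocal_table_D[p >> 24];  /* ceil(255<<16 / a) */
;;   dst[i] = (p & 0xFF000000)
;;          | (((((p >>  0) & 255) * r) >> 16) <<  0)
;;          | (((((p >>  8) & 255) * r) >> 16) <<  8)
;;          | (((((p >> 16) & 255) * r) >> 16) << 16);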
	; Advance src/dst pointers to the end and setup iteration
	; from -num_pixels up to 0.
	lea	rsi, [rsi + rcx*4]
	lea	rdi, [rdi + rcx*4]
	neg	rcx
	jz	.out			; No pixels at all? -> .out

	push	rdx			; Preserve rdx; the caller keeps
					;  its pixel count there.
.loop:
	; Load the next source pixel.
	mov	eax, [rsi + rcx*4]

%if ASHIFT == 24
	; Extract alpha and look up the reciprocal.
	mov	ebx, eax
	mov	ebp, eax		; Initialise result pixel register.
	and	ebp, 0xFF000000		; Mask off non-alpha from result pix.
;	jz	.next
	shr	ebx, 24			; Load alpha.
	mov	ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.

	; Do the component from bits 0..7.
	mov	edx, eax
	and	edx, 255		; Extract the next component.
	shr	eax, 8			; Shift it out.
	imul	edx, ebx		; Divide for a result in 8.16 fixed pt.
	shr	edx, 16			; Truncate and move to bits 0..7.
	or	ebp, edx		; Merge into result pixel.

	; Do the component from bits 8..15.
	mov	edx, eax
	and	edx, 255		; Extract the next component.
	shr	eax, 8			; Shift it out.
	imul	edx, ebx		; Divide for a result in 8.16 fixed pt.
	and	edx, 0x00FF0000		; Truncate fraction.
	shr	edx, 8			; Move to bits 8..15.
	or	ebp, edx		; Merge into result pixel.

	; Do the component from bits 16..23.
	and	eax, 255		; Mask off alpha.
	imul	eax, ebx		; Divide for a result in 8.16 fixed pt.
	and	eax, 0x00FF0000		; Truncate fraction.
	or	ebp, eax		; Merge into result pixel.

%elif ASHIFT == 0
	; Extract alpha and look up the reciprocal.
	mov	ebx, eax
	shr	eax, 8			; Shift out alpha.
	and	ebx, 255		; Mask off non-alpha.
	mov	ebp, ebx		; Initialise result pixel (keeps flags).
	jz	.next			; Zero alpha? -> result stays 0.
	mov	ebx, DWORD [reciprocal_table_D + 4*ebx] ; Load reciprocal.

	; Do the component from bits 8..15.
	mov	edx, eax
	shr	eax, 8
	and	edx, 255
	imul	edx, ebx
	and	edx, 0x00FF0000
	shr	edx, 8
	or	ebp, edx

	; Do the component from bits 16..23
	mov	edx, eax
	shr	eax, 8
	and	edx, 255
	imul	edx, ebx
	and	edx, 0x00FF0000
	or	ebp, edx

	; Do the component from bits 24..31.
	imul	eax, ebx
	and	eax, 0x00FF0000
	shl	eax, 8
	or	ebp, eax
%endif
.next:
	; Write the result pixel.
	mov	[rdi + rcx*4], ebp

	inc	rcx
	jnz	.loop

	pop	rdx			; Restore rdx for the caller.
.out:
	ret

%macro	unpremultiply_pixel_blocks 1
;; Faster version that does it in blocks of four pixels at a time.
;; The macro is parameterised on the instruction used to move
;; an XMM register to memory.
;;
;; In:
;;   uint32_t *dst/rdi:	Destination pixels.
;;   uint32_t *src/rsi:	Source pixels.
;;   num_pixels/rdx:	# pixels to unpremultiply.  Only
;;			 floor(num_pixels/4) of them will be done.
;;
;;   %1:		Instruction used to write an xmm reg to dst.
;;
;; Out:
;;   rcx:		num_pixels mod 4 = # leftover pixels.
;;   rdi:		rdi + 16*floor(num_pixels/4); advanced past dst.
;;   rsi:		rsi + 16*floor(num_pixels/4); advanced past src.
;;
;; Scratched: xmm1-xmm4, rax-rdx, rbp
	; Advance the src and dst pointers to the end.  The bias
	; of +-15 is used to have the loop condition trigger an exit
	; just before we access the last incomplete block.
	shl	rdx, 2			; Size in bytes.
	lea	rsi, [rsi + rdx - 15]
	lea	rdi, [rdi + rdx - 15]
	neg	rdx
	add	rdx, 15			; Offset to the last byte of the
					;  first block from the end.
	jmp	%%test_cc
	align 16
%%loop:
	; Load four pixels into xmm1.  The prefetchnta here
	; hides the difference between movdqa vs. movdqu for
	; aligned input.
	prefetchnta	[rsi + rdx + 64*8] ; TODO: check the prefetch dist?
	movdqu	xmm1, [rsi + rdx]

	; Expand the 8 bit components into 16 bit ones in
	; two registers.
	movdqa	xmm2, xmm1
	punpckhbw xmm2, xmm2
	punpcklbw xmm1, xmm1

	; Load alphas into registers.
	movzx	eax, byte [rsi + rdx + ASHIFT/8 + 0]
	movzx	ebx, byte [rsi + rdx + ASHIFT/8 + 4]
	movzx	ecx, byte [rsi + rdx + ASHIFT/8 + 8]
	movzx	ebp, byte [rsi + rdx + ASHIFT/8 + 12]

	; Fetch multiplier vectors for each pixel based on the alphas
	; into the xmm3/xmm4 registers.
	movq	xmm3, [reciprocal_table_Q + 8*eax]
	movq	xmm4, [reciprocal_table_Q + 8*ecx]
	movhpd	xmm3, [reciprocal_table_Q + 8*ebx]
	movhpd	xmm4, [reciprocal_table_Q + 8*ebp]

	; Do the unpremultiply in-place in the pixels in xmm1, xmm2.
	; Treating the components as 0.16 bit fixed point, the pmulhuw
	; leaves the integer part of x*255/a in the result for the colour
	; components x in (r,g,b) but leaves the alphas alone.
	pmulhuw	xmm1, xmm3
	pmulhuw	xmm2, xmm4
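
	; Worked example for one colour component: x = 0x80 at alpha
	; a = 0xC0.  The punpcklbw doubled the byte to 0x8080 and the
	; table entry is 255*256/0xC0 = 340, so pmulhuw leaves
	; (0x8080 * 340) >> 16 = 170 = trunc(0x80 * 255 / 0xC0).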

	; Pack the four resulting pixels from 16 to 8 bit components.
	; Here we saturate the result in case the input was superluminant.
	packuswb xmm1, xmm2

	; Write the result.
	%1	[rdi + rdx], xmm1

	; Advance to the next block of four pixels.  When this add
	; carries rdx over to >= 0 then the next block read would
	; venture past the end of the buffer.
	add	rdx, 16
%%test_cc:
	jnc	%%loop

	; Offset the pointers back to the last incomplete block.
	lea	rsi, [rsi + rdx]
	lea	rdi, [rdi + rdx]

	; Compute the # leftover pixels.
	lea	rcx, [rdx - 15]
	neg	rcx
	and	rcx, 15			; # bytes leftover.
	shr	rcx, 2			; # pixels leftover.
%endmacro
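
;; A rough intrinsics model of one %%loop iteration above (sketch
;; only: ASHIFT=24 assumed, unaligned load and plain store shown):
;;
;;   #include <emmintrin.h>
;;   #include <stdint.h>
;;
;;   extern uint64_t reciprocal_table_Q[256];
;;
;;   static void unpremultiply_block(uint32_t *dst, uint32_t const *src)
;;   {
;;       __m128i px = _mm_loadu_si128((__m128i const *)src);
;;       __m128i lo = _mm_unpacklo_epi8(px, px);      /* 8 -> 16 bit */
;;       __m128i hi = _mm_unpackhi_epi8(px, px);
;;       __m128i rl = _mm_set_epi64x(reciprocal_table_Q[src[1] >> 24],
;;                                   reciprocal_table_Q[src[0] >> 24]);
;;       __m128i rh = _mm_set_epi64x(reciprocal_table_Q[src[3] >> 24],
;;                                   reciprocal_table_Q[src[2] >> 24]);
;;       lo = _mm_mulhi_epu16(lo, rl);                /* pmulhuw */
;;       hi = _mm_mulhi_epu16(hi, rh);
;;       _mm_storeu_si128((__m128i *)dst,
;;                        _mm_packus_epi16(lo, hi));  /* packuswb */
;;   }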

global unpremultiply_with_sse2
unpremultiply_with_sse2:
;;
;; void unpremultiply_with_sse2(
;;	uint32_t *dst/rdi,
;;	uint32_t const *src/rsi,
;;	unsigned long num_pixels/rdx);
;;
;; This is the main entry point callable from the outside.
;; The calling convention used here is the System V AMD64 (ELF64) one.
;;
	; Save callee-saved registers.
	push	rbp
	push	rbx

	; Save start of dst for alignment tests later.
	mov	rcx, rdi

	; If we don't have enough pixels for at least a few iterations
	; of blocked unpremultiplication then do the pixels one at a time.
	cmp	rdx, 3+4*4+3		; Max. pre/post align (3+3) + 4 blocks.
	jae	.do_blocked
	 mov	rcx, rdx		; Pixel count.
	 call	unpremultiply_single_pixels
	 jmp	.out

.do_blocked:
	; If the destination pointer isn't even aligned to uint32_t
	; then we can't align it to 0 mod 16 using single pixels.
	test	rcx, 3
	jz	.can_align_dst
	 unpremultiply_pixel_blocks movdqu
	 jmp	.do_leftovers

.can_align_dst:
	; Align the destination pointer to 0 mod 16 by
	; doing 0..3 single pixels.
	neg	rcx
	and	rcx, 15			; # bytes to align to 16.
	shr	rcx, 2			; # pixels to align to 16.
	sub	rdx, rcx
	call	unpremultiply_single_pixels
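
	; (Example of the alignment math above: dst % 16 == 4 gives
	;  rcx = 12 bytes = 3 pixels of pre-alignment.)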

	; If the source and dest are exactly aliased then use plain
	; movdqa writes; otherwise use non-temporal movntdq writes.
	; (The commented-out test would also pick movdqa for small
	; images.)
	cmp	rdi, rsi		; Use movdqa for aliased src, dst.
	jz	.1
;	cmp	rdx, 128		; ... or if the image is small.
;	jc	.1
	 unpremultiply_pixel_blocks movntdq
	 jmp	.do_leftovers
.1:
	 unpremultiply_pixel_blocks movdqa

.do_leftovers:
	call	unpremultiply_single_pixels
.out:
	pop	rbx
	pop	rbp
	ret