;-------------------------------------------------------------------------
; cxm12161 -- This function performs YUV12-to-RGB16 color conversion for H26x.
;             It handles any format in which there are three fields, the low
;             order field being B and fully contained in the low order byte, the
;             second field being G and being somewhere in bits 4 through 11,
;             and the high order field being R and fully contained in the high
;             order byte.
;
;             The YUV12 input is planar, 8 bits per pel.  The Y plane may have
;             a pitch of up to 768.  It may have a width less than or equal
;             to the pitch.  It must be DWORD aligned, and preferably QWORD
;             aligned.  Pitch and Width must be a multiple of four.  For best
;             performance, Pitch should not be 4 more than a multiple of 32.
;             Height may be any amount, but must be a multiple of two.  The U
;             and V planes may have a different pitch than the Y plane, subject
;             to the same limitations.
;

;include iammx.inc
;include locals.inc


; Dispatch table indexed by CCType.  Each entry is the address of the code
; that fills in the shift counts and clamp limits for one 16-bit pixel layout:
;   index 0 -> RGB565, 1 -> RGB555, 2 -> RGB664, 3 -> RGB655
RGB_formats:
    dd  RGB565, RGB555, RGB664, RGB655

; 64-bit MMX constants, written out lane by lane.
; The multipliers carry 6 fractional bits (value/64); the commented-out
; figures kept from the original are the same coefficients with 8
; fractional bits.
; NOTE(review): the values look like the usual BT.601 YCbCr->RGB coefficients
; (1.164, 1.596, 0.813, 0.391, 2.018) -- confirm against the codec spec.

Minusg:   times 4 dw 0080h                      ; 128, subtracted from each U and V word
Yadd:     times 8 db 10h                        ; 16, subtracted (saturating) from each Y byte
VtR:      times 4 dw 0066h                      ; 102/64: V contribution to R   (8-bit form: 0199h)
VtG:      times 4 dw 0034h                      ; 52/64                          (8-bit form: 00d0h); not referenced in the visible code
UtG:      times 4 dw 0019h                      ; 25/64                          (8-bit form: 0064h); not referenced in the visible code
UtB:      times 4 dw 0081h                      ; 129/64: U contribution to B   (8-bit form: 0205h)
Ymul:     times 4 dw 004ah                      ; 74/64: luma gain              (8-bit form: 012ah)
UVtG:     dw 0019h, 0034h, 0019h, 0034h         ; {U, V} coefficient pairs for pmaddwd -> chroma part of G (8-bit form: 0064h, 00d0h)
VtRUtB:   dw 0205h, 0199h, 0205h, 0199h         ; 8-fractional-bit values; not referenced in the visible code
fourbitu: times 8 db 0f0h                       ; 255-15: clamp bias for a 4-bit field
fivebitu: times 8 db 0e0h                       ; 255-31: clamp bias for a 5-bit field
sixbitu:  times 8 db 0c0h                       ; 255-63: clamp bias for a 6-bit field

;------------------------------------------------------------------------------
; Stack frame layout.  Every offset is relative to ESP as it stands after the
; prologue (four register pushes followed by "sub esp, LocalFrameSize"):
;
;   [esp + 0 .. LocalFrameSize-1]      locals
;   next RegisterStorageSize bytes      the four saved registers
;   [esp + ArgBase]                     return address
;   [esp + ArgBase + 4 ...]             arguments, 4 bytes each
;------------------------------------------------------------------------------
%assign LocalFrameSize       156
%assign RegisterStorageSize  16
%assign ArgBase              LocalFrameSize + RegisterStorageSize

; Arguments, in the order of the C prototype:
%assign YPlane                    ArgBase +  4
%assign UPlane                    ArgBase +  8
%assign VPlane                    ArgBase + 12
%assign FrameWidth                ArgBase + 16   ; slot is later reused for -(width/2)
%assign FrameHeight               ArgBase + 20   ; slot is counted down by 2 per row pair
%assign YPitch                    ArgBase + 24
%assign ChromaPitch               ArgBase + 28
%assign AspectAdjustmentCount     ArgBase + 32
%assign ColorConvertedFrame       ArgBase + 36
%assign DCIOffset                 ArgBase + 40
%assign CCOffsetToLine0           ArgBase + 44
%assign CCOPitch                  ArgBase + 48
%assign CCType                    ArgBase + 52
%assign EndOfArgList              ArgBase + 56

; Locals: each offset is the previous slot plus the previous slot's size.
; Dword slots:
%assign CCOCursor                 0
%assign CCOSkipDistance           CCOCursor        + 4
%assign ChromaLineLen             CCOSkipDistance  + 4
%assign YCursor                   ChromaLineLen    + 4
%assign DistanceFromVToU          YCursor          + 4
%assign EndOfChromaLine           DistanceFromVToU + 4
%assign AspectCount               EndOfChromaLine  + 4
%assign AspectBaseCount           AspectCount      + 4
%assign tmpYCursorEven            AspectBaseCount  + 4
%assign tmpYCursorOdd             tmpYCursorEven   + 4
%assign tmpCCOPitch               tmpYCursorOdd    + 4
; 48-byte MMX scratch area, six qwords:
;   +0 V-128   +8 chroma G   +16 low chroma B
;   +24 low chroma R   +32 high chroma R   +40 high chroma B
%assign temp_mmx                  tmpCCOPitch      + 4
; Qword slots (MMX shift counts and per-byte clamp biases):
%assign RLeftShift                temp_mmx         + 48
%assign GLeftShift                RLeftShift       + 8
%assign RRightShift               GLeftShift       + 8
%assign GRightShift               RRightShift      + 8
%assign BRightShift               GRightShift      + 8
%assign RUpperLimit               BRightShift      + 8
%assign GUpperLimit               RUpperLimit      + 8
%assign BUpperLimit               GUpperLimit      + 8   ; last slot: ends at LocalFrameSize


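; C-level prototype of this routine.  Note that the symbol actually exported
; below is `yuv_2_rgb`, and that the `VPitch` parameter is the one the code
; calls `ChromaPitch` (it is applied to both the U and the V plane).
;
; extern void C MMX_YUV12ToRGB16 (
;                                     U8* YPlane,
;                                     U8* UPlane,
;                                     U8* VPlane,
;                                     UN  FrameWidth,
;                                     UN  FrameHeight,
;                                     UN  YPitch,
;                                     UN  VPitch,
;                                     UN  AspectAdjustmentCount,
;                                     U8* ColorConvertedFrame,
;                                     U32 DCIOffset,
;                                     U32 CCOffsetToLine0,
;                                     IN  CCOPitch,
;                                     IN  CCType)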
;
;  The local variables are on the stack,
;  The tables are in the one and only data segment.
;
;  CCOffsetToLine0 is relative to ColorConvertedFrame.
;  CCType  used by RGB color convertors to determine the exact conversion type.
;    RGB565 = 0 
;    RGB555 = 1
;    RGB664 = 2
;    RGB655 = 3

global yuv_2_rgb

;------------------------------------------------------------------------------
; yuv_2_rgb -- entry point.
;
; All arguments are read from the stack and the routine returns with a plain
; "ret", so the caller removes them.  esi, edi, ebp and ebx are saved here and
; restored at "finish"; the push order and LocalFrameSize must stay in step
; with the argument offsets and with the epilogue.
;
; After building the frame, CCType selects one of the four format-setup
; blocks through the RGB_formats table.
;------------------------------------------------------------------------------
yuv_2_rgb:
  push       esi
  push       edi

  push       ebp
  push       ebx

  sub        esp, LocalFrameSize      ; reserve the local frame
  mov        eax, [esp+CCType]
  cmp        eax,4
  jae        near finish              ; unsigned compare: rejects CCType >= 4 and negative values

  jmp        [RGB_formats+eax*4]      ; continue at RGB565 / RGB555 / RGB664 / RGB655

;------------------------------------------------------------------------------
; SET_RGB16_FORMAT rl, gl, rr, gr, br, rlimit, glimit, blimit
;
; Fills in the per-format conversion parameters on the local frame:
;   %1  RLeftShift   left shift that moves the R byte to the top of the high byte
;   %2  GLeftShift   left shift that moves the G word into its field
;   %3  RRightShift  right shift applied to the scaled 16-bit R sums
;   %4  GRightShift  right shift applied to the scaled 16-bit G sums
;   %5  BRightShift  right shift applied to the scaled 16-bit B sums
;   %6  label of the 8-byte clamp bias for R (fourbitu / fivebitu / sixbitu)
;   %7  label of the 8-byte clamp bias for G
;   %8  label of the 8-byte clamp bias for B
;
; Each shift count is stored as a 64-bit value (low dword = count, high
; dword = 0) because it is later used as an MMX memory shift operand.
; Clobbers mm0 only; eax and ebx are left untouched (RGBEND reloads both
; before using them).
;------------------------------------------------------------------------------
%macro SET_RGB16_FORMAT 8
  mov        dword [esp+RLeftShift],    %1
  mov        dword [esp+RLeftShift+4],  0
  mov        dword [esp+GLeftShift],    %2
  mov        dword [esp+GLeftShift+4],  0
  mov        dword [esp+RRightShift],   %3
  mov        dword [esp+RRightShift+4], 0
  mov        dword [esp+GRightShift],   %4
  mov        dword [esp+GRightShift+4], 0
  mov        dword [esp+BRightShift],   %5
  mov        dword [esp+BRightShift+4], 0
  movq       mm0, [%6]
  movq       [esp+RUpperLimit], mm0
  movq       mm0, [%7]
  movq       [esp+GUpperLimit], mm0
  movq       mm0, [%8]
  movq       [esp+BUpperLimit], mm0
%endmacro

; 5 bits R, 5 bits G, 5 bits B
RGB555:
  SET_RGB16_FORMAT 2, 5, 9, 9, 9, fivebitu, fivebitu, fivebitu
  jmp        RGBEND

; 6 bits R, 6 bits G, 4 bits B
RGB664:
  SET_RGB16_FORMAT 2, 4, 8, 8, 10, sixbitu, sixbitu, fourbitu
  jmp        RGBEND

; 6 bits R, 5 bits G, 5 bits B
RGB655:
  SET_RGB16_FORMAT 2, 5, 8, 9, 9, sixbitu, fivebitu, fivebitu
  jmp        RGBEND

; 5 bits R, 6 bits G, 5 bits B
RGB565:
  SET_RGB16_FORMAT 3, 5, 9, 8, 9, fivebitu, sixbitu, fivebitu
  ; no jump: execution falls through into RGBEND

;------------------------------------------------------------------------------
; RGBEND -- common set-up once the format parameters are stored.
;
; On exit (entry state of PrepareChromaLine):
;   esi = VPlane + FrameWidth/2        (end of the first V row)
;   edx = UPlane + FrameWidth/2        (end of the first U row)
;   edi = ColorConvertedFrame + DCIOffset + CCOffsetToLine0  (output cursor)
;   tmpYCursorEven / tmpYCursorOdd = end of luma rows 0 and 1
;   CCOSkipDistance = CCOPitch - 2*FrameWidth
;   the FrameWidth argument slot is overwritten with -(FrameWidth/2), the
;   negative index at which every chroma row starts
;   AspectCount = AspectBaseCount = AspectAdjustmentCount
;
; The row and column loops that follow test their condition at the bottom,
; so an empty frame would still convert one 8x2 block.  Zero height and
; non-positive width therefore leave through "finish" here, as does an
; AspectAdjustmentCount of 1.
;------------------------------------------------------------------------------
RGBEND:
  mov        eax, [esp+FrameHeight]
  test       eax, eax
  jz         near finish              ; no rows: nothing to convert

  mov        ebx, [esp+FrameWidth]
  test       ebx, ebx
  jle        near finish              ; no columns (the column counter is signed)

  mov        edx, [esp+AspectAdjustmentCount]
  cmp        edx, 1
  je         near finish
  mov        [esp+AspectCount], edx
  mov        [esp+AspectBaseCount], edx

  ; Chroma planes: keep V in esi and remember the distance from V to U.
  mov        esi, [esp+VPlane]
  mov        ecx, [esp+UPlane]
  sub        ecx, esi                 ; ecx = UPlane - VPlane
  mov        [esp+DistanceFromVToU], ecx

  ; Output cursor: first byte of output line 0.
  mov        edi, [esp+ColorConvertedFrame]
  add        edi, [esp+DCIOffset]
  add        edi, [esp+CCOffsetToLine0]
  mov        [esp+CCOCursor], edi

  ; Bytes from the end of one output row to the start of the next
  ; (each pixel is two bytes wide).
  mov        eax, [esp+CCOPitch]
  sub        eax, ebx
  sub        eax, ebx                 ; CCOPitch - 2*FrameWidth
  mov        [esp+CCOSkipDistance], eax

  ; Luma cursors point at the END of rows 0 and 1; the inner loop indexes
  ; them with a negative offset.
  mov        ebp, [esp+YPlane]
  mov        [esp+YCursor], ebp
  add        ebp, ebx
  mov        [esp+tmpYCursorEven], ebp
  add        ebp, [esp+YPitch]
  mov        [esp+tmpYCursorOdd], ebp

  ; Chroma rows are FrameWidth/2 bytes long and are addressed the same way.
  sar        ebx, 1                   ; ebx = FrameWidth/2
  mov        [esp+ChromaLineLen], ebx
  mov        [esp+EndOfChromaLine], ebx
  add        esi, ebx                 ; esi = end of first V row
  lea        edx, [esi+ecx]           ; edx = end of first U row
  neg        ebx
  mov        [esp+FrameWidth], ebx    ; slot now holds -(FrameWidth/2)

;  Register usage in the row-pair loop:
;    esi  end of the current V row          edx  end of the current U row
;    edi  output cursor                     ebx  negative chroma index, counts up to 0
;    ebp  end of the current luma row       eax  scratch / offset of the odd output row
;------------------------------------------------------------------------------
; PrepareChromaLine -- executed once per pair of luma rows.
;
; Chooses where the odd row of the pair is written: normally CCOPitch bytes
; after the even row.  AspectCount is decreased by 2; when the result is not
; above zero (unsigned) the offset becomes 0, so the odd row is stored on top
; of the even row, and AspectCount is topped up by AspectAdjustmentCount.
; Also reloads ebx with the start index -(FrameWidth/2) kept in the
; FrameWidth slot.
;------------------------------------------------------------------------------
PrepareChromaLine:
  mov        eax, [esp+CCOPitch]
  mov        [esp+tmpCCOPitch], eax   ; default: odd row goes one pitch below
  mov        ebx, [esp+FrameWidth]    ; ebx = -(FrameWidth/2)
  mov        ebp, [esp+AspectCount]
  sub        ebp, 2
  ja         continue                 ; still above zero: keep both rows

  xor        eax, eax
  add        ebp, [esp+AspectAdjustmentCount]
  mov        [esp+tmpCCOPitch], eax   ; 0: odd row overwrites the even row
continue:
  mov        [esp+AspectCount], ebp

;------------------------------------------------------------------------------
; do_next_8x2_block -- converts 8 pixels on each of two luma rows per pass.
;
;   ebx  negative chroma byte index; [edx+ebx] / [esi+ebx] fetch 4 U / 4 V
;        bytes and [ebp+2*ebx] fetches the matching 8 Y bytes.  It is
;        advanced by 4 per pass and the loop ends when it is no longer negative.
;   edi  output cursor; 16 bytes (8 pixels) are stored for the even row at
;        [edi] and 16 for the odd row at [edi+tmpCCOPitch], then edi += 16.
;
; The chroma products computed for the even row are parked in temp_mmx and
; reused for the odd row.  Instructions indented by one extra space are the
; second half of a hand-scheduled pair; the order must not be changed.
;
; NOTE(review): each pass always handles 8 luma pixels, so a width that is a
; multiple of 4 but not of 8 appears to be rounded up to the next multiple of
; 8 on both input and output -- confirm that callers allow for this.
; NOTE(review): the luma and chroma products are combined with wrapping
; 16-bit adds (paddw / psubw).  E.g. Y=255 with U=255 gives 239*74 + 127*129 =
; 34069, which exceeds 32767; presumably inputs stay within the nominal video
; range -- confirm.
;------------------------------------------------------------------------------
do_next_8x2_block:
  mov		ebp,[esp+tmpYCursorEven]
; ---- even row ----
  movd      mm1, [edx+ebx]         ; 4 U bytes
  pxor       mm0, mm0               ; mm0 = 0, used to widen bytes to words
  movd      mm2, [esi+ebx]         ; 4 V bytes
  punpcklbw  mm1, mm0               ; U as 4 words
  psubw      mm1, [Minusg]            ; U-128
  punpcklbw  mm2, mm0               ; V as 4 words
  psubw      mm2, [Minusg]            ; V-128
  movq       mm3, mm1               ; copy of U-128 (for the high pairs)
  movq       mm5, mm1               ; copy of U-128 (for the B term)
  punpcklwd  mm1, mm2               ; interleave: u0 v0 u1 v1
  pmaddwd    mm1, [UVtG]            ; per pair: 25*u + 52*v (dwords)
   punpckhwd  mm3, mm2               ; interleave: u2 v2 u3 v3
  pmaddwd    mm3, [UVtG]
  movq       [temp_mmx+esp], mm2     ; save V-128
  movq       mm6, [ebp+2*ebx]       ; 8 Y bytes of the even row
  psubusb    mm6, [Yadd]              ; Y-16, saturating at 0
   packssdw   mm1, mm3               ; 4 chroma-G terms as signed words
  movq       mm7, mm6               ; copy of the 8 Y-16 bytes
   punpcklbw  mm6, mm0               ; low 4 Y-16 as words
  pmullw     mm6, [Ymul]            ; scaled luma, low 4
   punpckhbw  mm7, mm0               ; high 4 Y-16 as words
  pmullw     mm7, [Ymul]            ; scaled luma, high 4
   movq       mm4, mm1
  movq       [temp_mmx+esp+8], mm1   ; save 4 chroma-G terms for the odd row
   punpcklwd  mm1, mm1               ; chroma G for pixels 0-3 (each term used twice)
  movq       mm0, mm6               ; keep scaled low luma
   punpckhwd  mm4, mm4               ; chroma G for pixels 4-7
  movq       mm3, mm7               ; keep scaled high luma
   psubw      mm6, mm1               ; low 4 G = luma - chroma G
  psraw      mm6, [esp+GRightShift] ; drop the fraction and reduce to the G field width
   psubw      mm7, mm4               ; high 4 G
  movq       mm2, mm5
   punpcklwd  mm5, mm5               ; u0 u0 u1 u1
  pmullw     mm5, [UtB]             ; chroma B, low 4
   punpckhwd  mm2, mm2               ; u2 u2 u3 u3
  psraw      mm7, [esp+GRightShift]
   pmullw     mm2, [UtB]            ; chroma B, high 4
  packuswb   mm6, mm7               ; mm6: G7 G6 G5 G4 G3 G2 G1 G0 (clamped at 0)
  movq       [temp_mmx+esp+16], mm5  ; save low chroma B
   paddw      mm5, mm0               ; low 4 B = luma + chroma B
  movq       [temp_mmx+esp+40], mm2  ; save high chroma B
   paddw      mm2, mm3               ; high 4 B
  psraw      mm5, [esp+BRightShift] ; reduce to the B field width
  psraw      mm2, [esp+BRightShift]
  packuswb   mm5, mm2               ; mm5: B7 B6 B5 B4 B3 B2 B1 B0 (clamped at 0)

  movq       mm2, [temp_mmx+esp]     ; V-128
   movq       mm1, mm5               ; mm1 = B bytes
  movq       mm7, mm2
   punpcklwd  mm2, mm2               ; v0 v0 v1 v1
  pmullw     mm2, [VtR]             ; chroma R, low 4
   punpckhwd  mm7, mm7               ; v2 v2 v3 v3
  pmullw     mm7, [VtR]             ; chroma R, high 4
  paddusb    mm1, [esp+BUpperLimit] ; clamp B, step 1: add the bias with unsigned saturation
  movq       [temp_mmx+esp+24], mm2  ; save low chroma R
  paddw      mm2, mm0               ; low 4 R = luma + chroma R
  psraw      mm2, [esp+RRightShift] ; reduce to the R field width
   pxor       mm4, mm4               ; mm4 = 0 (overwritten in the odd-row code before any read)
  movq       [temp_mmx+esp+32], mm7  ; save high chroma R
   paddw      mm7, mm3               ; high 4 R
  psraw      mm7, [esp+RRightShift]
  psubusb    mm1, [esp+BUpperLimit] ; clamp B, step 2: remove the bias -> B <= field maximum
   packuswb   mm2, mm7               ; mm2: R7 R6 R5 R4 R3 R2 R1 R0 (clamped at 0)
  paddusb    mm6, [esp+GUpperLimit] ; clamp G to its field maximum (add bias, ...
  psubusb    mm6, [esp+GUpperLimit] ; ... then remove it)
  paddusb    mm2, [esp+RUpperLimit] ; clamp R the same way
  psubusb    mm2, [esp+RUpperLimit]

; Pack the three byte vectors into eight 16-bit pixels.
; input:
       ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
       ; mm1: B7 B6 B5 B4 B3 B2 B1 B0
       ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
; each byte already lies in 0..H, where H = 2**bits - 1 for its field
; output:
;        mm1- result: 4 low RGB16
;        mm7- result: 4 high RGB16
; using: mm0- zero register
;        mm3- temporary results
; algorithm:
;   for (i=0; i<8; i++) {
;     RGB[i] = 256*(R[i] << RLeftShift) + (G[i] << GLeftShift) + B[i];
;   }

  psllq      mm2, [esp+RLeftShift]  ; move R to the top of each byte (clamped values cannot spill into the next byte)
   movq       mm7, mm1               ; mm7 = copy of B

; B needs no shift: it already sits in the low bits of the low byte,
; so R (high byte) and B (low byte) can simply be interleaved.

  punpcklbw  mm1, mm2               ; mm1: 4 low words, R in the high byte, B in the low byte
   pxor       mm0, mm0               ; mm0: 0
  punpckhbw  mm7, mm2               ; mm7: 4 high words, R:B
   movq       mm3, mm6               ; mm3: G
  punpcklbw  mm6, mm0               ; mm6: low 4 G as words
  psllw      mm6, [esp+GLeftShift]  ; move low G into its field
  punpckhbw  mm3, mm0               ; mm3: high 4 G as words
   por        mm1, mm6               ; mm1: 4 low finished pixels
  psllw      mm3, [esp+GLeftShift]  ; move high G into its field
  por        mm7, mm3               ; mm7: 4 high finished pixels

  mov		ebp,[esp+tmpYCursorOdd]          ; loaded early to save cycles before the odd row
  movq       [edi], mm1             ; even row, pixels 0-3 (author's note: aligned)

; ---- odd row: same chroma, new luma ----
  movq       mm1, [ebp+2*ebx]       ; 8 Y bytes of the odd row
   pxor       mm2, mm2
  psubusb    mm1, [Yadd]              ; Y-16, saturating at 0
  movq       mm5, mm1
   punpcklbw  mm1, mm2               ; low 4 Y-16 as words
  pmullw     mm1, [Ymul]              ; scaled luma, low 4
   punpckhbw  mm5, mm2               ; high 4 Y-16 as words
  pmullw     mm5,  [Ymul]              ; scaled luma, high 4
  movq       [edi+8], mm7           ; even row, pixels 4-7 (author's note: aligned)
   movq       mm0, mm1
  paddw      mm0, [temp_mmx+esp+24]  ; low 4 R = luma + saved chroma R
   movq       mm6, mm5
  psraw      mm0, [esp+RRightShift] ; reduce to the R field width
  paddw      mm5, [temp_mmx+esp+32]  ; high 4 R
   movq       mm2, mm1
  psraw      mm5, [esp+RRightShift]
  paddw      mm2, [temp_mmx+esp+16]  ; low 4 B = luma + saved chroma B
   packuswb   mm0, mm5               ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
  psraw      mm2, [esp+BRightShift] ; reduce to the B field width
   movq       mm5, mm6
  paddw      mm6, [temp_mmx+esp+40]  ; high 4 B
  psraw      mm6, [esp+BRightShift]
  movq       mm3, [temp_mmx+esp+8]   ; saved 4 chroma-G terms
  packuswb   mm2, mm6               ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
   movq       mm4, mm3
  punpcklwd  mm3, mm3               ; chroma G for pixels 0-3
  punpckhwd  mm4, mm4               ; chroma G for pixels 4-7
   psubw      mm1, mm3               ; low 4 G = luma - chroma G
  psraw      mm1, [esp+GRightShift] ; reduce to the G field width
   psubw      mm5, mm4               ; high 4 G
  psraw      mm5, [esp+GRightShift]
  paddusb    mm2, [esp+BUpperLimit] ; clamp B to its field maximum (add bias, ...
   packuswb   mm1, mm5               ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
  psubusb    mm2, [esp+BUpperLimit] ; ... then remove it)
  paddusb    mm1, [esp+GUpperLimit] ; clamp G
  psubusb    mm1, [esp+GUpperLimit]
  paddusb    mm0, [esp+RUpperLimit] ; clamp R
  mov		eax,[esp+tmpCCOPitch]            ; offset of the odd output row (CCOPitch or 0)
  psubusb    mm0, [esp+RUpperLimit]

; Pack the odd row exactly as the even row was packed.
       ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
       ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
       ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
; output:
;        mm2- result: 4 low RGB16
;        mm7- result: 4 high RGB16
; using: mm4- zero register
;        mm3- temporary results

  psllq       mm0, [esp+RLeftShift] ; move R to the top of each byte
   movq        mm7, mm2              ; mm7 = copy of B

; B needs no shift; R (high byte) and B (low byte) are interleaved directly.

  punpcklbw  mm2, mm0               ; mm2: 4 low words, R:B
   pxor       mm4, mm4               ; mm4: 0
  movq       mm3, mm1               ; mm3: G
   punpckhbw  mm7, mm0               ; mm7: 4 high words, R:B
  punpcklbw  mm1, mm4               ; mm1: low 4 G as words
  punpckhbw  mm3, mm4               ; mm3: high 4 G as words
  psllw      mm1, [esp+GLeftShift]  ; move low G into its field
   por        mm2, mm1               ; mm2: 4 low finished pixels
  psllw      mm3, [esp+GLeftShift]  ; move high G into its field
  por        mm7, mm3               ; mm7: 4 high finished pixels
  movq       [edi+eax], mm2         ; odd row, pixels 0-3
  movq       [edi+eax+8], mm7       ; odd row, pixels 4-7 (author's note: aligned)
  add        edi, 16                ; 8 pixels * 2 bytes written per row
   add        ebx, 4                 ; 4 chroma bytes (= 8 luma pixels) consumed
  jl         near do_next_8x2_block      ; repeat while the index is still negative

  add		edi,[esp+CCOSkipDistance]        ; go to begin of next line
  add 		edi,[esp+tmpCCOPitch]           ; skip odd line (if it is needed)
; Leax       AspectCount
; Lebp       CCOPitch               ; skip odd line

; sub        eax, 2
; jg         @f

; Addeax     AspectBaseCount
; xor        ebp, ebp

;@@:
;  Seax       AspectCount
;  add        edi, ebp

  mov		eax,[esp+YPitch]
  mov		ebp,[esp+tmpYCursorOdd]
  add        ebp, eax       ; skip one line
;  lea        ebp, [ebp+2*eax]       ; skip two lines
  mov		[esp+tmpYCursorEven],ebp
;  Sebp       tmpYCursorOdd

  add        ebp, eax       ; skip one line
  mov		[esp+tmpYCursorOdd],ebp
;  Lebp       tmpYCursorEven
;  lea        ebp, [ebp+2*eax]
;  Sebp       tmpYCursorEven


  add		esi,[esp+ChromaPitch]
  add		edx,[esp+ChromaPitch]


;  Leax       YLimit                  ; Done with last line?
;  cmp        ebp, eax
;  jbe        PrepareChromaLine
   sub      word [esp+FrameHeight],2
   ja       near PrepareChromaLine


;------------------------------------------------------------------------------
; Common exit, also reached by the early-out paths (CCType out of range,
; AspectAdjustmentCount == 1).  Restores the registers in the reverse order of
; the pushes in the prologue.
finish:
  emms                                ; clear MMX state so later x87 code sees an empty FP stack
  add        esp, LocalFrameSize      ; drop the local frame

  pop        ebx
  pop        ebp
  pop        edi
  pop        esi
  ret                                 ; arguments are left for the caller to remove


