/*****************************************************************************
 *
 *  XVID MPEG-4 VIDEO CODEC
 *  - colorspace conversion module -
 *
 *  Copyright(C) 2002 Peter Ross <pross@xvid.org>
 *               2002 Michael Militzer <isibaar@xvid.org>
 *
 *  This file is part of XviD, a free MPEG-4 video encoder/decoder
 *
 *  XviD is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 *
 *  Under section 8 of the GNU General Public License, the copyright
 *  holders of XVID explicitly forbid distribution in the following
 *  countries:
 *
 *    - Japan
 *    - United States of America
 *
 *  Linking XviD statically or dynamically with other modules is making a
 *  combined work based on XviD.  Thus, the terms and conditions of the
 *  GNU General Public License cover the whole combination.
 *
 *  As a special exception, the copyright holders of XviD give you
 *  permission to link XviD with independent modules that communicate with
 *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
 *  license terms of these independent modules, and to copy and distribute
 *  the resulting combined work under terms of your choice, provided that
 *  every copy of the combined work is accompanied by a complete copy of
 *  the source code of XviD (the version of XviD used to produce the
 *  combined work), being distributed under the terms of the GNU General
 *  Public License plus this exception.  An independent module is a module
 *  which is not derived from or based on XviD.
 *
 *  Note that people who make modified versions of XviD are not obligated
 *  to grant this special exception for their modified versions; it is
 *  their choice whether to do so.  The GNU General Public License gives
 *  permission to release a modified version without this exception; this
 *  exception also makes it possible to release a modified version which
 *  carries forward this exception.
 *
 * $Id: colorspace.c,v 1.7 2002-11-26 23:44:10 edgomez Exp $
 *
 ****************************************************************************/

#include <string.h>				/* memcpy */

#include "colorspace.h"
#include "../divx4.h"			/* DEC_PICTURE */

/* function pointers */

/* input */
color_inputFuncPtr rgb555_to_yv12;
color_inputFuncPtr rgb565_to_yv12;
color_inputFuncPtr rgb24_to_yv12;
color_inputFuncPtr rgb32_to_yv12;
color_inputFuncPtr yuv_to_yv12;
color_inputFuncPtr yuyv_to_yv12;
color_inputFuncPtr uyvy_to_yv12;

/* output */
color_outputFuncPtr yv12_to_rgb555;
color_outputFuncPtr yv12_to_rgb565;
color_outputFuncPtr yv12_to_rgb24;
color_outputFuncPtr yv12_to_rgb32;
color_outputFuncPtr yv12_to_yuv;
color_outputFuncPtr yv12_to_yuyv;
color_outputFuncPtr yv12_to_uyvy;

#define MIN(A,B)	((A)<(B)?(A):(B))
#define MAX(A,B)	((A)>(B)?(A):(B))

/*	rgb -> yuv def's

	this following constants are "official spec"
	Video Demystified" (ISBN 1-878707-09-4)

	rgb<->yuv _is_ lossy, since most programs do the conversion differently
		
	SCALEBITS/FIX taken from  ffmpeg
*/

#define Y_R_IN			0.257
#define Y_G_IN			0.504
#define Y_B_IN			0.098
#define Y_ADD_IN		16

#define U_R_IN			0.148
#define U_G_IN			0.291
#define U_B_IN			0.439
#define U_ADD_IN		128

#define V_R_IN			0.439
#define V_G_IN			0.368
#define V_B_IN			0.071
#define V_ADD_IN		128

#define SCALEBITS_IN	8
#define FIX_IN(x)		((uint16_t) ((x) * (1L<<SCALEBITS_IN) + 0.5))


int32_t RGB_Y_tab[256];
int32_t B_U_tab[256];
int32_t G_U_tab[256];
int32_t G_V_tab[256];
int32_t R_V_tab[256];


/* rgb555 -> yuv 4:2:0 planar */
void
rgb555_to_yv12_c(uint8_t * y_out,
				 uint8_t * u_out,
				 uint8_t * v_out,
				 uint8_t * src,
				 int width,
				 int height,
				 int y_stride)
{
	int32_t src_stride = width * 2;
	uint32_t y_dif = y_stride - width;
	uint32_t uv_dif = (y_stride - width) / 2;
	uint32_t x, y;

	if (height < 0) {
		height = -height;
		src += (height - 1) * src_stride;
		src_stride = -src_stride;
	}


	for (y = height / 2; y; y--) {
		/* process one 2x2 block per iteration */
		for (x = 0; x < (uint32_t) width; x += 2) {
			int rgb, r, g, b, r4, g4, b4;

			rgb = *(uint16_t *) (src + x * 2);
			b4 = b = (rgb << 3) & 0xf8;
			g4 = g = (rgb >> 2) & 0xf8;
			r4 = r = (rgb >> 7) & 0xf8;
			y_out[0] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			rgb = *(uint16_t *) (src + x * 2 + src_stride);
			b4 += b = (rgb << 3) & 0xf8;
			g4 += g = (rgb >> 2) & 0xf8;
			r4 += r = (rgb >> 7) & 0xf8;
			y_out[y_stride] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			rgb = *(uint16_t *) (src + x * 2 + 2);
			b4 += b = (rgb << 3) & 0xf8;
			g4 += g = (rgb >> 2) & 0xf8;
			r4 += r = (rgb >> 7) & 0xf8;
			y_out[1] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			rgb = *(uint16_t *) (src + x * 2 + src_stride + 2);
			b4 += b = (rgb << 3) & 0xf8;
			g4 += g = (rgb >> 2) & 0xf8;
			r4 += r = (rgb >> 7) & 0xf8;
			y_out[y_stride + 1] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			*u_out++ =
				(uint8_t) ((-FIX_IN(U_R_IN) * r4 - FIX_IN(U_G_IN) * g4 +
							FIX_IN(U_B_IN) * b4) >> (SCALEBITS_IN + 2)) +
				U_ADD_IN;


			*v_out++ =
				(uint8_t) ((FIX_IN(V_R_IN) * r4 - FIX_IN(V_G_IN) * g4 -
							FIX_IN(V_B_IN) * b4) >> (SCALEBITS_IN + 2)) +
				V_ADD_IN;

			y_out += 2;
		}
		src += src_stride * 2;
		y_out += y_dif + y_stride;
		u_out += uv_dif;
		v_out += uv_dif;
	}
}


/* rgb565_to_yuv_c
	NOTE:	identical to rgb555 except for shift/mask 
			not tested */

void
rgb565_to_yv12_c(uint8_t * y_out,
				 uint8_t * u_out,
				 uint8_t * v_out,
				 uint8_t * src,
				 int width,
				 int height,
				 int y_stride)
{
	int32_t src_stride = width * 2;

	uint32_t y_dif = y_stride - width;
	uint32_t uv_dif = (y_stride - width) / 2;
	uint32_t x, y;

	if (height < 0) {
		height = -height;
		src += (height - 1) * src_stride;
		src_stride = -src_stride;
	}


	for (y = height / 2; y; y--) {
		/* process one 2x2 block per iteration */
		for (x = 0; x < (uint32_t) width; x += 2) {
			int rgb, r, g, b, r4, g4, b4;

			rgb = *(uint16_t *) (src + x * 2);
			b4 = b = (rgb << 3) & 0xf8;
			g4 = g = (rgb >> 3) & 0xfc;
			r4 = r = (rgb >> 8) & 0xf8;
			y_out[0] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			rgb = *(uint16_t *) (src + x * 2 + src_stride);
			b4 += b = (rgb << 3) & 0xf8;
			g4 += g = (rgb >> 3) & 0xfc;
			r4 += r = (rgb >> 8) & 0xf8;
			y_out[y_stride] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			rgb = *(uint16_t *) (src + x * 2 + 2);
			b4 += b = (rgb << 3) & 0xf8;
			g4 += g = (rgb >> 3) & 0xfc;
			r4 += r = (rgb >> 8) & 0xf8;
			y_out[1] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			rgb = *(uint16_t *) (src + x * 2 + src_stride + 2);
			b4 += b = (rgb << 3) & 0xf8;
			g4 += g = (rgb >> 3) & 0xfc;
			r4 += r = (rgb >> 8) & 0xf8;
			y_out[y_stride + 1] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			*u_out++ =
				(uint8_t) ((-FIX_IN(U_R_IN) * r4 - FIX_IN(U_G_IN) * g4 +
							FIX_IN(U_B_IN) * b4) >> (SCALEBITS_IN + 2)) +
				U_ADD_IN;


			*v_out++ =
				(uint8_t) ((FIX_IN(V_R_IN) * r4 - FIX_IN(V_G_IN) * g4 -
							FIX_IN(V_B_IN) * b4) >> (SCALEBITS_IN + 2)) +
				V_ADD_IN;

			y_out += 2;
		}
		src += src_stride * 2;
		y_out += y_dif + y_stride;
		u_out += uv_dif;
		v_out += uv_dif;
	}
}


/*	rgb24 -> yuv 4:2:0 planar 

	NOTE: always flips.
*/

void
rgb24_to_yv12_c(uint8_t * y_out,
				uint8_t * u_out,
				uint8_t * v_out,
				uint8_t * src,
				int width,
				int height,
				int stride)
{
	uint32_t width3 = (width << 1) + width;	/* width * 3 */
	uint32_t src_dif = (width << 3) + width;	/* width3 * 3 */
	uint32_t y_dif = (stride << 1) - width;
	uint32_t uv_dif = (stride - width) >> 1;
	uint32_t x, y;

	src += (height - 2) * width3;


	for (y = height >> 1; y; y--) {
		for (x = width >> 1; x; x--) {
			uint32_t r, g, b, r4, g4, b4;

			b4 = b = src[0];
			g4 = g = src[1];
			r4 = r = src[2];
			y_out[stride + 0] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			b4 += (b = src[3]);
			g4 += (g = src[4]);
			r4 += (r = src[5]);
			y_out[stride + 1] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			b4 += (b = src[width3 + 0]);
			g4 += (g = src[width3 + 1]);
			r4 += (r = src[width3 + 2]);
			y_out[0] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			b4 += (b = src[width3 + 3]);
			g4 += (g = src[width3 + 4]);
			r4 += (r = src[width3 + 5]);
			y_out[1] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			*u_out++ =
				(uint8_t) ((-FIX_IN(U_R_IN) * r4 - FIX_IN(U_G_IN) * g4 +
							FIX_IN(U_B_IN) * b4) >> (SCALEBITS_IN + 2)) +
				U_ADD_IN;


			*v_out++ =
				(uint8_t) ((FIX_IN(V_R_IN) * r4 - FIX_IN(V_G_IN) * g4 -
							FIX_IN(V_B_IN) * b4) >> (SCALEBITS_IN + 2)) +
				V_ADD_IN;


			src += 6;
			y_out += 2;
		}
		src -= src_dif;
		y_out += y_dif;
		u_out += uv_dif;
		v_out += uv_dif;
	}
}


/*	rgb32 -> yuv 4:2:0 planar 

	NOTE: always flips
*/

void
rgb32_to_yv12_c(uint8_t * y_out,
				uint8_t * u_out,
				uint8_t * v_out,
				uint8_t * src,
				int width,
				int height,
				int stride)
{
	uint32_t width4 = (width << 2);	/* width * 4 */
	uint32_t src_dif = 3 * width4;
	uint32_t y_dif = (stride << 1) - width;
	uint32_t uv_dif = (stride - width) >> 1;
	uint32_t x, y;

	src += (height - 2) * width4;

	for (y = height >> 1; y; y--) {
		for (x = width >> 1; x; x--) {
			uint32_t r, g, b, r4, g4, b4;

			b4 = b = src[0];
			g4 = g = src[1];
			r4 = r = src[2];
			y_out[stride + 0] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			b4 += (b = src[4]);
			g4 += (g = src[5]);
			r4 += (r = src[6]);
			y_out[stride + 1] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			b4 += (b = src[width4 + 0]);
			g4 += (g = src[width4 + 1]);
			r4 += (r = src[width4 + 2]);

			y_out[0] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			b4 += (b = src[width4 + 4]);
			g4 += (g = src[width4 + 5]);
			r4 += (r = src[width4 + 6]);
			y_out[1] =
				(uint8_t) ((FIX_IN(Y_R_IN) * r + FIX_IN(Y_G_IN) * g +
							FIX_IN(Y_B_IN) * b) >> SCALEBITS_IN) + Y_ADD_IN;

			*u_out++ =
				(uint8_t) ((-FIX_IN(U_R_IN) * r4 - FIX_IN(U_G_IN) * g4 +
							FIX_IN(U_B_IN) * b4) >> (SCALEBITS_IN + 2)) +
				U_ADD_IN;

			*v_out++ =
				(uint8_t) ((FIX_IN(V_R_IN) * r4 - FIX_IN(V_G_IN) * g4 -
							FIX_IN(V_B_IN) * b4) >> (SCALEBITS_IN + 2)) +
				V_ADD_IN;

			src += 8;
			y_out += 2;
		}
		src -= src_dif;
		y_out += y_dif;
		u_out += uv_dif;
		v_out += uv_dif;
	}
}

/*	yuv planar -> yuv 4:2:0 planar
   
	NOTE: does not flip */

void
yuv_to_yv12_c(uint8_t * y_out,
			  uint8_t * u_out,
			  uint8_t * v_out,
			  uint8_t * src,
			  int width,
			  int height,
			  int stride)
{
	uint32_t stride2 = stride >> 1;
	uint32_t width2 = width >> 1;
	uint32_t y;

	for (y = height; y; y--) {
		memcpy(y_out, src, width);
		src += width;
		y_out += stride;
	}

	for (y = height >> 1; y; y--) {
		memcpy(u_out, src, width2);
		src += width2;
		u_out += stride2;
	}

	for (y = height >> 1; y; y--) {
		memcpy(v_out, src, width2);
		src += width2;
		v_out += stride2;
	}
}


/* yuyv (yuv2) packed -> yuv 4:2:0 planar
   
   NOTE: does not flip */

void
yuyv_to_yv12_c(uint8_t * y_out,
			   uint8_t * u_out,
			   uint8_t * v_out,
			   uint8_t * src,
			   int width,
			   int height,
			   int stride)
{
	uint32_t width2 = width + width;
	uint32_t y_dif = stride - width;
	uint32_t uv_dif = y_dif >> 1;
	uint32_t x, y;

	for (y = height >> 1; y; y--) {

		for (x = width >> 1; x; x--) {
			*y_out++ = *src++;
			/**u_out++ = *src++; */
			*u_out++ = (*(src + width2) + *src) >> 1;
			src++;
			*y_out++ = *src++;
			/**v_out++ = *src++; */
			*v_out++ = (*(src + width2) + *src) >> 1;
			src++;

		}

		y_out += y_dif;
		u_out += uv_dif;
		v_out += uv_dif;

		for (x = width >> 1; x; x--) {
			*y_out++ = *src++;
			src++;
			*y_out++ = *src++;
			src++;
		}

		y_out += y_dif;

	}

}


/* uyvy packed -> yuv 4:2:0 planar
   
   NOTE: does not flip */


void
uyvy_to_yv12_c(uint8_t * y_out,
			   uint8_t * u_out,
			   uint8_t * v_out,
			   uint8_t * src,
			   int width,
			   int height,
			   int stride)
{
	uint32_t width2 = width + width;
	uint32_t y_dif = stride - width;
	uint32_t uv_dif = y_dif >> 1;
	uint32_t x, y;

	for (y = height >> 1; y; y--) {

		for (x = width >> 1; x; x--) {
			*u_out++ = *src++;
			/* *u_out++ = (*(src+width2) + *src++) >> 1; */
			*y_out++ = *src++;
			/**v_out++ = *src++; */
			*v_out++ = (*(src + width2) + *src) >> 1;
			src++;
			*y_out++ = *src++;
		}

		y_out += y_dif;
		u_out += uv_dif;;
		v_out += uv_dif;;

		for (x = width >> 1; x; x--) {
			src++;
			*y_out++ = *src++;
			src++;
			*y_out++ = *src++;
		}

		y_out += y_dif;
	}
}


/*	yuv -> rgb def's */

#define RGB_Y_OUT		1.164
#define B_U_OUT			2.018
#define Y_ADD_OUT		16

#define G_U_OUT			0.391
#define G_V_OUT			0.813
#define U_ADD_OUT		128

#define R_V_OUT			1.596
#define V_ADD_OUT		128


#define SCALEBITS_OUT	13
#define FIX_OUT(x)		((uint16_t) ((x) * (1L<<SCALEBITS_OUT) + 0.5))


/* initialize rgb lookup tables */

void
colorspace_init(void)
{
	int32_t i;

	for (i = 0; i < 256; i++) {
		RGB_Y_tab[i] = FIX_OUT(RGB_Y_OUT) * (i - Y_ADD_OUT);
		B_U_tab[i] = FIX_OUT(B_U_OUT) * (i - U_ADD_OUT);
		G_U_tab[i] = FIX_OUT(G_U_OUT) * (i - U_ADD_OUT);
		G_V_tab[i] = FIX_OUT(G_V_OUT) * (i - V_ADD_OUT);
		R_V_tab[i] = FIX_OUT(R_V_OUT) * (i - V_ADD_OUT);
	}
}

/* yuv 4:2:0 planar -> rgb555 + very simple error diffusion
*/

#define MK_RGB555(R,G,B)	((MAX(0,MIN(255, R)) << 7) & 0x7c00) | \
							((MAX(0,MIN(255, G)) << 2) & 0x03e0) | \
							((MAX(0,MIN(255, B)) >> 3) & 0x001f)


void
yv12_to_rgb555_c(uint8_t * dst,
				 int dst_stride,
				 uint8_t * y_src,
				 uint8_t * u_src,
				 uint8_t * v_src,
				 int y_stride,
				 int uv_stride,
				 int width,
				 int height)
{
	const uint32_t dst_dif = 4 * dst_stride - 2 * width;
	int32_t y_dif = 2 * y_stride - width;

	uint8_t *dst2 = dst + 2 * dst_stride;
	uint8_t *y_src2 = y_src + y_stride;
	uint32_t x, y;

	if (height < 0) {
		height = -height;
		y_src += (height - 1) * y_stride;
		y_src2 = y_src - y_stride;
		u_src += (height / 2 - 1) * uv_stride;
		v_src += (height / 2 - 1) * uv_stride;
		y_dif = -width - 2 * y_stride;
		uv_stride = -uv_stride;
	}

	for (y = height / 2; y; y--) {
		int r, g, b;
		int r2, g2, b2;

		r = g = b = 0;
		r2 = g2 = b2 = 0;

		/* process one 2x2 block per iteration */
		for (x = 0; x < (uint32_t) width / 2; x++) {
			int u, v;
			int b_u, g_uv, r_v, rgb_y;

			u = u_src[x];
			v = v_src[x];

			b_u = B_U_tab[u];
			g_uv = G_U_tab[u] + G_V_tab[v];
			r_v = R_V_tab[v];

			rgb_y = RGB_Y_tab[*y_src];
			b = (b & 0x7) + ((rgb_y + b_u) >> SCALEBITS_OUT);
			g = (g & 0x7) + ((rgb_y - g_uv) >> SCALEBITS_OUT);
			r = (r & 0x7) + ((rgb_y + r_v) >> SCALEBITS_OUT);
			*(uint16_t *) dst = MK_RGB555(r, g, b);

			y_src++;
			rgb_y = RGB_Y_tab[*y_src];
			b = (b & 0x7) + ((rgb_y + b_u) >> SCALEBITS_OUT);
			g = (g & 0x7) + ((rgb_y - g_uv) >> SCALEBITS_OUT);
			r = (r & 0x7) + ((rgb_y + r_v) >> SCALEBITS_OUT);
			*(uint16_t *) (dst + 2) = MK_RGB555(r, g, b);
			y_src++;

			rgb_y = RGB_Y_tab[*y_src2];
			b2 = (b2 & 0x7) + ((rgb_y + b_u) >> SCALEBITS_OUT);
			g2 = (g2 & 0x7) + ((rgb_y - g_uv) >> SCALEBITS_OUT);
			r2 = (r2 & 0x7) + ((rgb_y + r_v) >> SCALEBITS_OUT);
			*(uint16_t *) (dst2) = MK_RGB555(r2, g2, b2);
			y_src2++;

			rgb_y = RGB_Y_tab[*y_src2];
			b2 = (b2 & 0x7) + ((rgb_y + b_u) >> SCALEBITS_OUT);
			g2 = (g2 & 0x7) + ((rgb_y - g_uv) >> SCALEBITS_OUT);
			r2 = (r2 & 0x7) + ((rgb_y + r_v) >> SCALEBITS_OUT);
			*(uint16_t *) (dst2 + 2) = MK_RGB555(r2, g2, b2);
			y_src2++;

			dst += 4;
			dst2 += 4;
		}

		dst += dst_dif;
		dst2 += dst_dif;

		y_src += y_dif;
		y_src2 += y_dif;

		u_src += uv_stride;
		v_src += uv_stride;
	}
}


/* yuv 4:2:0 planar -> rgb565 + very simple error diffusion
	NOTE:	identical to rgb555 except for shift/mask  */


#define MK_RGB565(R,G,B)	((MAX(0,MIN(255, R)) << 8) & 0xf800) | \
							((MAX(0,MIN(255, G)) << 3) & 0x07e0) | \
							((MAX(0,MIN(255, B)) >> 3) & 0x001f)

void
yv12_to_rgb565_c(uint8_t * dst,
				 int dst_stride,
				 uint8_t * y_src,
				 uint8_t * u_src,
				 uint8_t * v_src,
				 int y_stride,
				 int uv_stride,
				 int width,
				 int height)
{
	const uint32_t dst_dif = 4 * dst_stride - 2 * width;
	int32_t y_dif = 2 * y_stride - width;

	uint8_t *dst2 = dst + 2 * dst_stride;
	uint8_t *y_src2 = y_src + y_stride;
	uint32_t x, y;

	if (height < 0) {			/* flip image? */
		height = -height;
		y_src += (height - 1) * y_stride;
		y_src2 = y_src - y_stride;
		u_src += (height / 2 - 1) * uv_stride;
		v_src += (height / 2 - 1) * uv_stride;
		y_dif = -width - 2 * y_stride;
		uv_stride = -uv_stride;
	}

	for (y = height / 2; y; y--) {
		int r, g, b;
		int r2, g2, b2;

		r = g = b = 0;
		r2 = g2 = b2 = 0;

		/* process one 2x2 block per iteration */
		for (x = 0; x < (uint32_t) width / 2; x++) {
			int u, v;
			int b_u, g_uv, r_v, rgb_y;

			u = u_src[x];
			v = v_src[x];

			b_u = B_U_tab[u];
			g_uv = G_U_tab[u] + G_V_tab[v];
			r_v = R_V_tab[v];

			rgb_y = RGB_Y_tab[*y_src];
			b = (b & 0x7) + ((rgb_y + b_u) >> SCALEBITS_OUT);
			g = (g & 0x7) + ((rgb_y - g_uv) >> SCALEBITS_OUT);
			r = (r & 0x7) + ((rgb_y + r_v) >> SCALEBITS_OUT);
			*(uint16_t *) dst = MK_RGB565(r, g, b);

			y_src++;
			rgb_y = RGB_Y_tab[*y_src];
			b = (b & 0x7) + ((rgb_y + b_u) >> SCALEBITS_OUT);
			g = (g & 0x7) + ((rgb_y - g_uv) >> SCALEBITS_OUT);
			r = (r & 0x7) + ((rgb_y + r_v) >> SCALEBITS_OUT);
			*(uint16_t *) (dst + 2) = MK_RGB565(r, g, b);
			y_src++;

			rgb_y = RGB_Y_tab[*y_src2];
			b2 = (b2 & 0x7) + ((rgb_y + b_u) >> SCALEBITS_OUT);
			g2 = (g2 & 0x7) + ((rgb_y - g_uv) >> SCALEBITS_OUT);
			r2 = (r2 & 0x7) + ((rgb_y + r_v) >> SCALEBITS_OUT);
			*(uint16_t *) (dst2) = MK_RGB565(r2, g2, b2);
			y_src2++;

			rgb_y = RGB_Y_tab[*y_src2];
			b2 = (b2 & 0x7) + ((rgb_y + b_u) >> SCALEBITS_OUT);
			g2 = (g2 & 0x7) + ((rgb_y - g_uv) >> SCALEBITS_OUT);
			r2 = (r2 & 0x7) + ((rgb_y + r_v) >> SCALEBITS_OUT);
			*(uint16_t *) (dst2 + 2) = MK_RGB565(r2, g2, b2);
			y_src2++;

			dst += 4;
			dst2 += 4;
		}

		dst += dst_dif;
		dst2 += dst_dif;

		y_src += y_dif;
		y_src2 += y_dif;

		u_src += uv_stride;
		v_src += uv_stride;
	}
}


/* yuv 4:2:0 planar -> rgb24 */

void
yv12_to_rgb24_c(uint8_t * dst,
				int dst_stride,
				uint8_t * y_src,
				uint8_t * u_src,
				uint8_t * v_src,
				int y_stride,
				int uv_stride,
				int width,
				int height)
{
	const uint32_t dst_dif = 6 * dst_stride - 3 * width;
	int32_t y_dif = 2 * y_stride - width;

	uint8_t *dst2 = dst + 3 * dst_stride;
	uint8_t *y_src2 = y_src + y_stride;
	uint32_t x, y;

	if (height < 0) {			/* flip image? */
		height = -height;
		y_src += (height - 1) * y_stride;
		y_src2 = y_src - y_stride;
		u_src += (height / 2 - 1) * uv_stride;
		v_src += (height / 2 - 1) * uv_stride;
		y_dif = -width - 2 * y_stride;
		uv_stride = -uv_stride;
	}

	for (y = height / 2; y; y--) {
		/* process one 2x2 block per iteration */
		for (x = 0; x < (uint32_t) width / 2; x++) {
			int u, v;
			int b_u, g_uv, r_v, rgb_y;
			int r, g, b;

			u = u_src[x];
			v = v_src[x];

			b_u = B_U_tab[u];
			g_uv = G_U_tab[u] + G_V_tab[v];
			r_v = R_V_tab[v];

			rgb_y = RGB_Y_tab[*y_src];
			b = (rgb_y + b_u) >> SCALEBITS_OUT;
			g = (rgb_y - g_uv) >> SCALEBITS_OUT;
			r = (rgb_y + r_v) >> SCALEBITS_OUT;
			dst[0] = MAX(0, MIN(255, b));
			dst[1] = MAX(0, MIN(255, g));
			dst[2] = MAX(0, MIN(255, r));

			y_src++;
			rgb_y = RGB_Y_tab[*y_src];
			b = (rgb_y + b_u) >> SCALEBITS_OUT;
			g = (rgb_y - g_uv) >> SCALEBITS_OUT;
			r = (rgb_y + r_v) >> SCALEBITS_OUT;
			dst[3] = MAX(0, MIN(255, b));
			dst[4] = MAX(0, MIN(255, g));
			dst[5] = MAX(0, MIN(255, r));
			y_src++;

			rgb_y = RGB_Y_tab[*y_src2];
			b = (rgb_y + b_u) >> SCALEBITS_OUT;
			g = (rgb_y - g_uv) >> SCALEBITS_OUT;
			r = (rgb_y + r_v) >> SCALEBITS_OUT;
			dst2[0] = MAX(0, MIN(255, b));
			dst2[1] = MAX(0, MIN(255, g));
			dst2[2] = MAX(0, MIN(255, r));
			y_src2++;

			rgb_y = RGB_Y_tab[*y_src2];
			b = (rgb_y + b_u) >> SCALEBITS_OUT;
			g = (rgb_y - g_uv) >> SCALEBITS_OUT;
			r = (rgb_y + r_v) >> SCALEBITS_OUT;
			dst2[3] = MAX(0, MIN(255, b));
			dst2[4] = MAX(0, MIN(255, g));
			dst2[5] = MAX(0, MIN(255, r));
			y_src2++;

			dst += 6;
			dst2 += 6;
		}

		dst += dst_dif;
		dst2 += dst_dif;

		y_src += y_dif;
		y_src2 += y_dif;

		u_src += uv_stride;
		v_src += uv_stride;
	}
}


/* yuv 4:2:0 planar -> rgb32 */

void
yv12_to_rgb32_c(uint8_t * dst,
				int dst_stride,
				uint8_t * y_src,
				uint8_t * v_src,
				uint8_t * u_src,
				int y_stride,
				int uv_stride,
				int width,
				int height)
{
	const uint32_t dst_dif = 8 * dst_stride - 4 * width;
	int32_t y_dif = 2 * y_stride - width;

	uint8_t *dst2 = dst + 4 * dst_stride;
	uint8_t *y_src2 = y_src + y_stride;
	uint32_t x, y;

	if (height < 0) {			/* flip image? */
		height = -height;
		y_src += (height - 1) * y_stride;
		y_src2 = y_src - y_stride;
		u_src += (height / 2 - 1) * uv_stride;
		v_src += (height / 2 - 1) * uv_stride;
		y_dif = -width - 2 * y_stride;
		uv_stride = -uv_stride;
	}

	for (y = height / 2; y; y--) {
		/* process one 2x2 block per iteration */
		for (x = 0; x < (uint32_t) width / 2; x++) {
			int u, v;
			int b_u, g_uv, r_v, rgb_y;
			int r, g, b;

			u = u_src[x];
			v = v_src[x];

			b_u = B_U_tab[u];
			g_uv = G_U_tab[u] + G_V_tab[v];
			r_v = R_V_tab[v];

			rgb_y = RGB_Y_tab[*y_src];
			b = (rgb_y + b_u) >> SCALEBITS_OUT;
			g = (rgb_y - g_uv) >> SCALEBITS_OUT;
			r = (rgb_y + r_v) >> SCALEBITS_OUT;
			dst[0] = MAX(0, MIN(255, r));
			dst[1] = MAX(0, MIN(255, g));
			dst[2] = MAX(0, MIN(255, b));
			dst[3] = 0;

			y_src++;
			rgb_y = RGB_Y_tab[*y_src];
			b = (rgb_y + b_u) >> SCALEBITS_OUT;
			g = (rgb_y - g_uv) >> SCALEBITS_OUT;
			r = (rgb_y + r_v) >> SCALEBITS_OUT;
			dst[4] = MAX(0, MIN(255, r));
			dst[5] = MAX(0, MIN(255, g));
			dst[6] = MAX(0, MIN(255, b));
			dst[7] = 0;
			y_src++;

			rgb_y = RGB_Y_tab[*y_src2];
			b = (rgb_y + b_u) >> SCALEBITS_OUT;
			g = (rgb_y - g_uv) >> SCALEBITS_OUT;
			r = (rgb_y + r_v) >> SCALEBITS_OUT;
			dst2[0] = MAX(0, MIN(255, r));
			dst2[1] = MAX(0, MIN(255, g));
			dst2[2] = MAX(0, MIN(255, b));
			dst2[3] = 0;
			y_src2++;

			rgb_y = RGB_Y_tab[*y_src2];
			b = (rgb_y + b_u) >> SCALEBITS_OUT;
			g = (rgb_y - g_uv) >> SCALEBITS_OUT;
			r = (rgb_y + r_v) >> SCALEBITS_OUT;
			dst2[4] = MAX(0, MIN(255, r));
			dst2[5] = MAX(0, MIN(255, g));
			dst2[6] = MAX(0, MIN(255, b));
			dst2[7] = 0;
			y_src2++;

			dst += 8;
			dst2 += 8;
		}

		dst += dst_dif;
		dst2 += dst_dif;

		y_src += y_dif;
		y_src2 += y_dif;

		u_src += uv_stride;
		v_src += uv_stride;
	}
}


/*	yuv 4:2:0 planar -> yuv planar */

void
yv12_to_yuv_c(uint8_t * dst,
			  int dst_stride,
			  uint8_t * y_src,
			  uint8_t * u_src,
			  uint8_t * v_src,
			  int y_stride,
			  int uv_stride,
			  int width,
			  int height)
{
	uint32_t dst_stride2 = dst_stride >> 1;
	uint32_t width2 = width >> 1;
	uint32_t y;

	if (height < 0) {
		height = -height;
		y_src += (height - 1) * y_stride;
		u_src += (height / 2 - 1) * uv_stride;
		v_src += (height / 2 - 1) * uv_stride;
		y_stride = -y_stride;
		uv_stride = -uv_stride;
	}

	for (y = height; y; y--) {
		memcpy(dst, y_src, width);
		dst += dst_stride;
		y_src += y_stride;
	}

	for (y = height >> 1; y; y--) {
		memcpy(dst, u_src, width2);
		dst += dst_stride2;
		u_src += uv_stride;
	}

	for (y = height >> 1; y; y--) {
		memcpy(dst, v_src, width2);
		dst += dst_stride2;
		v_src += uv_stride;
	}
}


/* yuv 4:2:0 planar -> yuyv (yuv2) packed */

void
yv12_to_yuyv_c(uint8_t * dst,
			   int dst_stride,
			   uint8_t * y_src,
			   uint8_t * u_src,
			   uint8_t * v_src,
			   int y_stride,
			   int uv_stride,
			   int width,
			   int height)
{
	const uint32_t dst_dif = 2 * (dst_stride - width);
	uint32_t x, y;

	if (height < 0) {
		height = -height;
		y_src += (height - 1) * y_stride;
		u_src += (height / 2 - 1) * uv_stride;
		v_src += (height / 2 - 1) * uv_stride;
		y_stride = -y_stride;
		uv_stride = -uv_stride;
	}

	for (y = 0; y < (uint32_t) height; y++) {
		for (x = 0; x < (uint32_t) width / 2; x++) {
			dst[0] = y_src[2 * x];
			dst[1] = u_src[x];
			dst[2] = y_src[2 * x + 1];
			dst[3] = v_src[x];
			dst += 4;
		}
		dst += dst_dif;
		y_src += y_stride;
		if (y & 1) {
			u_src += uv_stride;
			v_src += uv_stride;
		}
	}
}


/* yuv 4:2:0 planar -> uyvy packed */

void
yv12_to_uyvy_c(uint8_t * dst,
			   int dst_stride,
			   uint8_t * y_src,
			   uint8_t * u_src,
			   uint8_t * v_src,
			   int y_stride,
			   int uv_stride,
			   int width,
			   int height)
{
	const uint32_t dst_dif = 2 * (dst_stride - width);
	uint32_t x, y;

	if (height < 0) {
		height = -height;
		y_src += (height - 1) * y_stride;
		u_src += (height / 2 - 1) * uv_stride;
		v_src += (height / 2 - 1) * uv_stride;
		y_stride = -y_stride;
		uv_stride = -uv_stride;
	}

	for (y = 0; y < (uint32_t) height; y++) {
		for (x = 0; x < (uint32_t) width / 2; x++) {
			dst[0] = u_src[x];
			dst[1] = y_src[2 * x];
			dst[2] = v_src[x];
			dst[3] = y_src[2 * x + 1];
			dst += 4;
		}
		dst += dst_dif;
		y_src += y_stride;
		if (y & 1) {
			u_src += uv_stride;
			v_src += uv_stride;
		}
	}
}


/*	user yuv planar -> yuv 4:2:0 planar
   
	NOTE: does not flip */

void
user_to_yuv_c(uint8_t * y_out,
			  uint8_t * u_out,
			  uint8_t * v_out,
			  int stride,
			  DEC_PICTURE * picture,
			  int width,
			  int height)
{
	uint32_t stride2 = stride >> 1;
	uint32_t width2 = width >> 1;
	uint32_t y;
	uint8_t *src;

	src = picture->y;
	for (y = height; y; y--) {
		memcpy(y_out, src, width);
		src += picture->stride_y;
		y_out += stride;
	}

	src = picture->u;
	for (y = height >> 1; y; y--) {
		memcpy(u_out, src, width2);
		src += picture->stride_uv;
		u_out += stride2;
	}

	src = picture->v;
	for (y = height >> 1; y; y--) {
		memcpy(v_out, src, width2);
		src += picture->stride_uv;
		v_out += stride2;
	}
}