--- branches/dev-api-3/xvidcore/src/motion/motion_est.c	2002/11/12 14:46:03	644
+++ branches/dev-api-3/xvidcore/src/motion/motion_est.c	2002/11/23 15:09:31	672
@@ -47,21 +47,11 @@
 #define FINAL_SKIP_THRESH	(50)
 #define MAX_SAD00_FOR_SKIP	(20)
 #define MAX_CHROMA_SAD_FOR_SKIP	(22)
-#define SKIP_THRESH_B (25)
+#define SKIP_THRESH_B (15)
 
 #define CHECK_CANDIDATE(X,Y,D) { \
 (*CheckCandidate)((const int)(X),(const int)(Y), (D), &iDirection, data ); }
 
-#define GET_REFERENCE(X, Y, REF) { \
-	switch ( (((X)&1)<<1) + ((Y)&1) ) \
-	{ \
-		case 0 : REF = (uint8_t *)data->Ref + (X)/2 + ((Y)/2)*(data->iEdgedWidth); break; \
-		case 1 : REF = (uint8_t *)data->RefV + (X)/2 + (((Y)-1)/2)*(data->iEdgedWidth); break; \
-		case 2 : REF = (uint8_t *)data->RefH + ((X)-1)/2 + ((Y)/2)*(data->iEdgedWidth); break; \
-		default : REF = (uint8_t *)data->RefHV + ((X)-1)/2 + (((Y)-1)/2)*(data->iEdgedWidth); break; \
-	} \
-}
-
 #define iDiamondSize 2
 
 static __inline int
@@ -126,49 +116,163 @@
 	return sad;
 }
 
+static __inline const uint8_t *
+GetReference(const int x, const int y, const int dir, const SearchData * const data)
+{
+//	Returns a pointer into one of eight reference planes, chosen by dir and the parities of x and y. dir : 0 = forward (Ref*), 1 = backward (bRef*)
+	switch ( (dir << 2) | ((x&1)<<1) | (y&1) ) { // selector: bit 2 = dir, bit 1 = x odd, bit 0 = y odd
+		case 0 : return data->Ref + x/2 + (y/2)*(data->iEdgedWidth); // x even, y even
+		case 1 : return data->RefV + x/2 + ((y-1)/2)*(data->iEdgedWidth); // x even, y odd; (y-1)/2 rounds down for negative odd y too
+		case 2 : return data->RefH + (x-1)/2 + (y/2)*(data->iEdgedWidth); // x odd, y even; (x-1)/2 rounds down for negative odd x too
+		case 3 : return data->RefHV + (x-1)/2 + ((y-1)/2)*(data->iEdgedWidth); // x odd, y odd
+		case 4 : return data->bRef + x/2 + (y/2)*(data->iEdgedWidth); // cases 4-7: same parity order as 0-3, using the bRef* planes
+		case 5 : return data->bRefV + x/2 + ((y-1)/2)*(data->iEdgedWidth);
+		case 6 : return data->bRefH + (x-1)/2 + (y/2)*(data->iEdgedWidth);
+		default : return data->bRefHV + (x-1)/2 + ((y-1)/2)*(data->iEdgedWidth); // selector 7; also reached for any selector outside 0..6
 
-/* CHECK_CANDIATE FUNCTIONS START */
+	}
+}
+
+static uint8_t * 
+Interpolate8x8qpel(const int x, const int y, const int block, const int dir, const SearchData * const data)
+{
+// create or find a qpel-precision 8x8 reference for sub-block 'block' at position (x, y); return pointer to it
+	uint8_t * Reference = (uint8_t *)data->RefQ + 16*dir; // scratch output inside RefQ; the dir=1 area starts 16 bytes after the dir=0 area
+	const int32_t iEdgedWidth = data->iEdgedWidth;
+	const uint32_t rounding = data->rounding;
+	const int halfpel_x = x/2; // truncates toward zero; x - halfpel_x is the other coordinate used when x is odd
+	const int halfpel_y = y/2; // likewise for y
+	const uint8_t *ref1, *ref2, *ref3, *ref4;
+
+	ref1 = GetReference(halfpel_x, halfpel_y, dir, data); // this reference is used in all cases
+	ref1 += 8 * (block&1) + 8 * (block>>1) * iEdgedWidth; // step to the sub-block: block bit 0 -> +8 columns, block>>1 -> +8 rows each
+	switch( ((x&1)<<1) + (y&1) ) { // bit 1 = x odd, bit 0 = y odd
+	case 0: // pure halfpel position
+		Reference = (uint8_t *) GetReference(halfpel_x, halfpel_y, dir, data); // nothing to average: point into the reference plane itself (const is cast away, RefQ is not written)
+		Reference += 8 * (block&1) + 8 * (block>>1) * iEdgedWidth; // same sub-block offset as applied to ref1
+		break;
+
+	case 1: // x halfpel, y qpel - top or bottom during qpel refinement
+		ref2 = GetReference(halfpel_x, y - halfpel_y, dir, data);
+		ref2 += 8 * (block&1) + 8 * (block>>1) * iEdgedWidth;
+		interpolate8x8_avg2(Reference, ref1, ref2, iEdgedWidth, rounding, 8); // writes the scratch buffer from ref1 and ref2; trailing 8 is presumably a row count - TODO confirm
+		break;
+
+	case 2: // x qpel, y halfpel - left or right during qpel refinement
+		ref2 = GetReference(x - halfpel_x, halfpel_y, dir, data);
+		ref2 += 8 * (block&1) + 8 * (block>>1) * iEdgedWidth;
+		interpolate8x8_avg2(Reference, ref1, ref2, iEdgedWidth, rounding, 8);
+		break;
+
+	default: // x and y in qpel resolution - the "corners" (top left/right and
+			 // bottom left/right) during qpel refinement
+		ref2 = GetReference(halfpel_x, y - halfpel_y, dir, data);
+		ref3 = GetReference(x - halfpel_x, halfpel_y, dir, data);
+		ref4 = GetReference(x - halfpel_x, y - halfpel_y, dir, data);
+		ref2 += 8 * (block&1) + 8 * (block>>1) * iEdgedWidth;
+		ref3 += 8 * (block&1) + 8 * (block>>1) * iEdgedWidth;
+		ref4 += 8 * (block&1) + 8 * (block>>1) * iEdgedWidth;
+		interpolate8x8_avg4(Reference, ref1, ref2, ref3, ref4, iEdgedWidth, rounding); // all four surrounding references feed the scratch buffer
+		break;
+	}
+	return Reference; // scratch buffer for cases 1, 2 and default; pointer into a reference plane for case 0
+}
 
+static uint8_t * 
+Interpolate16x16qpel(const int x, const int y, const int dir, const SearchData * const data)
+{
+// create or find a qpel-precision 16x16 reference at position (x, y), built from four 8x8 calls; return pointer to it
+	uint8_t * Reference = (uint8_t *)data->RefQ + 16*dir; // scratch output inside RefQ; the dir=1 area starts 16 bytes after the dir=0 area
+	const int32_t iEdgedWidth = data->iEdgedWidth;
+	const uint32_t rounding = data->rounding;
+	const int halfpel_x = x/2; // truncates toward zero; x - halfpel_x is the other coordinate used when x is odd
+	const int halfpel_y = y/2; // likewise for y
+	const uint8_t *ref1, *ref2, *ref3, *ref4;
+
+	ref1 = GetReference(halfpel_x, halfpel_y, dir, data); // this reference is used in all cases
+	switch( ((x&1)<<1) + (y&1) ) { // bit 1 = x odd, bit 0 = y odd
+	case 0: // pure halfpel position
+		return (uint8_t *) GetReference(halfpel_x, halfpel_y, dir, data); // NOTE(review): same call as ref1 above; const is cast away and RefQ is not written
+	case 1: // x halfpel, y qpel - top or bottom during qpel refinement
+		ref2 = GetReference(halfpel_x, y - halfpel_y, dir, data);
+		interpolate8x8_avg2(Reference, ref1, ref2, iEdgedWidth, rounding, 8); // top-left 8x8; trailing 8 is presumably a row count - TODO confirm
+		interpolate8x8_avg2(Reference+8, ref1+8, ref2+8, iEdgedWidth, rounding, 8); // top-right 8x8
+		interpolate8x8_avg2(Reference+8*iEdgedWidth, ref1+8*iEdgedWidth, ref2+8*iEdgedWidth, iEdgedWidth, rounding, 8); // bottom-left 8x8
+		interpolate8x8_avg2(Reference+8*iEdgedWidth+8, ref1+8*iEdgedWidth+8, ref2+8*iEdgedWidth+8, iEdgedWidth, rounding, 8); // bottom-right 8x8
+		break;
+
+	case 2: // x qpel, y halfpel - left or right during qpel refinement
+		ref2 = GetReference(x - halfpel_x, halfpel_y, dir, data);		
+		interpolate8x8_avg2(Reference, ref1, ref2, iEdgedWidth, rounding, 8); // same four quadrants as in case 1
+		interpolate8x8_avg2(Reference+8, ref1+8, ref2+8, iEdgedWidth, rounding, 8);
+		interpolate8x8_avg2(Reference+8*iEdgedWidth, ref1+8*iEdgedWidth, ref2+8*iEdgedWidth, iEdgedWidth, rounding, 8);
+		interpolate8x8_avg2(Reference+8*iEdgedWidth+8, ref1+8*iEdgedWidth+8, ref2+8*iEdgedWidth+8, iEdgedWidth, rounding, 8);
+		break;
+
+	default: // x and y in qpel resolution - the "corners" (top left/right and
+			 // bottom left/right) during qpel refinement
+		ref2 = GetReference(halfpel_x, y - halfpel_y, dir, data);
+		ref3 = GetReference(x - halfpel_x, halfpel_y, dir, data);
+		ref4 = GetReference(x - halfpel_x, y - halfpel_y, dir, data);
+		interpolate8x8_avg4(Reference, ref1, ref2, ref3, ref4, iEdgedWidth, rounding); // same four quadrants, each fed by all four references
+		interpolate8x8_avg4(Reference+8, ref1+8, ref2+8, ref3+8, ref4+8, iEdgedWidth, rounding);
+		interpolate8x8_avg4(Reference+8*iEdgedWidth, ref1+8*iEdgedWidth, ref2+8*iEdgedWidth, ref3+8*iEdgedWidth, ref4+8*iEdgedWidth, iEdgedWidth, rounding);
+		interpolate8x8_avg4(Reference+8*iEdgedWidth+8, ref1+8*iEdgedWidth+8, ref2+8*iEdgedWidth+8, ref3+8*iEdgedWidth+8, ref4+8*iEdgedWidth+8, iEdgedWidth, rounding);
+		break;
+	}
+	return Reference; // scratch buffer (case 0 has already returned a pointer into a reference plane)
+}
+
+/* CHECK_CANDIDATE FUNCTIONS START */
 
 static void 
 CheckCandidate16(const int x, const int y, const int Direction, int * const dir, const SearchData * const data)
 {
-	int t;
+	int t, xc, yc; // t: value returned by d_mv_bits for this candidate; xc,yc: coordinates handed to ChromaSAD
 	const uint8_t * Reference;
+	VECTOR * current; // vector array that receives new best candidates (currentQMV or currentMV)
 
 	if (( x > data->max_dx) || ( x < data->min_dx)
 		|| ( y > data->max_dy) || (y < data->min_dy)) return;
 
-	switch ( ((x&1)<<1) + (y&1) ) {
-		case 0 : Reference = data->Ref + x/2 + (y/2)*(data->iEdgedWidth); break;
-		case 1 : Reference = data->RefV + x/2 + ((y-1)/2)*(data->iEdgedWidth); break;
-		case 2 : Reference = data->RefH + (x-1)/2 + (y/2)*(data->iEdgedWidth); break;
-		default : Reference = data->RefHV + (x-1)/2 + ((y-1)/2)*(data->iEdgedWidth); break;
+	if (data->qpel_precision) { // x and y are in 1/4 precision
+		Reference = Interpolate16x16qpel(x, y, 0, data); // dir 0 = forward reference
+		t = d_mv_bits(x - data->predMV.x, y - data->predMV.y, data->iFcode); // candidate is compared with predMV without scaling
+		xc = x/2; yc = y/2; //for chroma sad (coordinates are halved)
+		current = data->currentQMV;
+	} else { // x and y are presumably in half-pel units here - TODO confirm
+		switch ( ((x&1)<<1) + (y&1) ) { // bit 1 = x odd, bit 0 = y odd
+			case 0 : Reference = data->Ref + x/2 + (y/2)*(data->iEdgedWidth); break;
+			case 1 : Reference = data->RefV + x/2 + ((y-1)/2)*(data->iEdgedWidth); break;
+			case 2 : Reference = data->RefH + (x-1)/2 + (y/2)*(data->iEdgedWidth); break;
+			default : Reference = data->RefHV + (x-1)/2 + ((y-1)/2)*(data->iEdgedWidth); break;
+		}
+		if (data->qpel) t = d_mv_bits(2*x - data->predMV.x, 2*y - data->predMV.y, data->iFcode); // candidate doubled first; presumably predMV is in quarter-pel units when data->qpel is set
+		else t = d_mv_bits(x - data->predMV.x, y - data->predMV.y, data->iFcode);
+		current = data->currentMV;
+		xc = x; yc = y; // passed to ChromaSAD unchanged
 	}
 	
 	data->temp[0] = sad16v(data->Cur, Reference, data->iEdgedWidth, data->temp + 1);
 
-	if (data->qpel) t = d_mv_bits(2*x - data->predQMV.x, 2*y - data->predQMV.y, data->iFcode);
-	else t = d_mv_bits(x - data->predMV.x, y - data->predMV.y, data->iFcode);
-
 	data->temp[0] += (data->lambda16 * t * data->temp[0])/1000;
 	data->temp[1] += (data->lambda8 * t * (data->temp[1] + NEIGH_8X8_BIAS))/100;
 
-	if (data->chroma) data->temp[0] += ChromaSAD(x, y, data);
+	if (data->chroma) data->temp[0] += ChromaSAD(xc, yc, data); // chroma term is added to the 16x16 cost only
 
 	if (data->temp[0] < data->iMinSAD[0]) {
 		data->iMinSAD[0] = data->temp[0];
-		data->currentMV[0].x = x; data->currentMV[0].y = y;
+		current[0].x = x; current[0].y = y; // slot 0: best vector for the temp[0] (16x16) cost
 		*dir = Direction; }
 
 	if (data->temp[1] < data->iMinSAD[1]) {
-		data->iMinSAD[1] = data->temp[1]; data->currentMV[1].x = x; data->currentMV[1].y = y; }
+		data->iMinSAD[1] = data->temp[1]; current[1].x = x; current[1].y= y; } // slots 1..4 track minima of temp[1..4] (presumably the four 8x8 SADs written by sad16v)
 	if (data->temp[2] < data->iMinSAD[2]) {
-		data->iMinSAD[2] = data->temp[2]; data->currentMV[2].x = x; data->currentMV[2].y = y; }
+		data->iMinSAD[2] = data->temp[2]; current[2].x = x; current[2].y = y; }
 	if (data->temp[3] < data->iMinSAD[3]) {
-		data->iMinSAD[3] = data->temp[3]; data->currentMV[3].x = x; data->currentMV[3].y = y; }
+		data->iMinSAD[3] = data->temp[3]; current[3].x = x; current[3].y = y; }
 	if (data->temp[4] < data->iMinSAD[4]) {
-		data->iMinSAD[4] = data->temp[4]; data->currentMV[4].x = x; data->currentMV[4].y = y; }
+		data->iMinSAD[4] = data->temp[4]; current[4].x = x; current[4].y = y; }
 
 }
 
@@ -177,108 +281,41 @@
 {
 	int32_t sad;
 	const uint8_t * Reference;
+	int t;
+	VECTOR * current;
 
 	if (( x > data->max_dx) || ( x < data->min_dx)
 		|| ( y > data->max_dy) || (y < data->min_dy)) return;
 
-	switch ( ((x&1)<<1) + (y&1) )
-	{
-		case 0 : Reference = data->Ref + x/2 + (y/2)*(data->iEdgedWidth); break;
-		case 1 : Reference = data->RefV + x/2 + ((y-1)/2)*(data->iEdgedWidth); break;
-		case 2 : Reference = data->RefH + (x-1)/2 + (y/2)*(data->iEdgedWidth); break;
-		default : Reference = data->RefHV + (x-1)/2 + ((y-1)/2)*(data->iEdgedWidth); break;
+	if (data->qpel_precision) { // x and y are in 1/4 precision
+		Reference = Interpolate16x16qpel(x, y, 0, data);
+		t = d_mv_bits(x - data->predMV.x, y - data->predMV.y, data->iFcode);
+		current = data->currentQMV;
+	} else {
+		switch ( ((x&1)<<1) + (y&1) ) {
+			case 0 : Reference = data->Ref + x/2 + (y/2)*(data->iEdgedWidth); break;
+			case 1 : Reference = data->RefV + x/2 + ((y-1)/2)*(data->iEdgedWidth); break;
+			case 2 : Reference = data->RefH + (x-1)/2 + (y/2)*(data->iEdgedWidth); break;
+			default : Reference = data->RefHV + (x-1)/2 + ((y-1)/2)*(data->iEdgedWidth); break;
+		}
+		if (data->qpel) t = d_mv_bits(2*x - data->predMV.x, 2*y - data->predMV.y, data->iFcode);
+		else t = d_mv_bits(x - data->predMV.x, y - data->predMV.y, data->iFcode);
+		current = data->currentMV;
 	}
-
-	sad = sad16(data->Cur, Reference, data->iEdgedWidth, MV_MAX_ERROR);
-	sad += (data->lambda16 * d_mv_bits(x - data->predMV.x, y - data->predMV.y, data->iFcode) * sad)/1000;
+	
+	sad = sad16(data->Cur, Reference, data->iEdgedWidth, 256*4096);
+	sad += (data->lambda16 * t * sad)/1000;
 
 	if (sad < *(data->iMinSAD)) {
 		*(data->iMinSAD) = sad;
-		data->currentMV[0].x = x; data->currentMV[0].y = y;
+		current->x = x; current->y = y;
 		*dir = Direction; }
 }
 
 static void 
-CheckCandidate16_qpel(const int x, const int y, const int Direction, int * const dir, const SearchData * const data)
-
-// CheckCandidate16 variant which expects x and y in quarter pixel resolution
-// Important: This is no general usable routine! x and y must be +/-1 (qpel resolution!)
-// around currentMV!
-{
-	int t;
-	uint8_t * Reference = (uint8_t *)data->RefQ;
-	const uint8_t *ref1, *ref2, *ref3, *ref4;
-	VECTOR halfpelMV = *(data->currentMV);
-
-	int32_t iEdgedWidth = data->iEdgedWidth;
-	uint32_t rounding = data->rounding;
-
-	if (( x > data->max_dx) || ( x < data->min_dx)
-		|| ( y > data->max_dy) || (y < data->min_dy)) return;
-
-	GET_REFERENCE(halfpelMV.x, halfpelMV.y, ref1); // this refenrence is used in all cases
-	switch( ((x&1)<<1) + (y&1) )
-	{
-	case 0: // pure halfpel position - shouldn't happen during a refinement step
-		GET_REFERENCE(halfpelMV.x, halfpelMV.y, Reference); 
-		break;
-
-	case 1: // x halfpel, y qpel - top or bottom during qpel refinement
-		GET_REFERENCE(halfpelMV.x, y - halfpelMV.y, ref2);
-		interpolate8x8_avg2(Reference, ref1, ref2, iEdgedWidth, rounding);
-		interpolate8x8_avg2(Reference+8, ref1+8, ref2+8, iEdgedWidth, rounding);
-		interpolate8x8_avg2(Reference+8*iEdgedWidth, ref1+8*iEdgedWidth, ref2+8*iEdgedWidth, iEdgedWidth, rounding);
-		interpolate8x8_avg2(Reference+8*iEdgedWidth+8, ref1+8*iEdgedWidth+8, ref2+8*iEdgedWidth+8, iEdgedWidth, rounding);
-		break;
-
-	case 2: // x qpel, y halfpel - left or right during qpel refinement
-		GET_REFERENCE(x - halfpelMV.x, halfpelMV.y, ref2);
-		interpolate8x8_avg2(Reference, ref1, ref2, iEdgedWidth, rounding);
-		interpolate8x8_avg2(Reference+8, ref1+8, ref2+8, iEdgedWidth, rounding);
-		interpolate8x8_avg2(Reference+8*iEdgedWidth, ref1+8*iEdgedWidth, ref2+8*iEdgedWidth, iEdgedWidth, rounding);
-		interpolate8x8_avg2(Reference+8*iEdgedWidth+8, ref1+8*iEdgedWidth+8, ref2+8*iEdgedWidth+8, iEdgedWidth, rounding);
-		break;
-
-	default: // x and y in qpel resolution - the "corners" (top left/right and
-			 // bottom left/right) during qpel refinement
-		GET_REFERENCE(halfpelMV.x, y - halfpelMV.y, ref2);
-		GET_REFERENCE(x - halfpelMV.x, halfpelMV.y, ref3);
-		GET_REFERENCE(x - halfpelMV.x, y - halfpelMV.y, ref4);
-
-		interpolate8x8_avg4(Reference, ref1, ref2, ref3, ref4, iEdgedWidth, rounding);
-		interpolate8x8_avg4(Reference+8, ref1+8, ref2+8, ref3+8, ref4+8, iEdgedWidth, rounding);
-		interpolate8x8_avg4(Reference+8*iEdgedWidth, ref1+8*iEdgedWidth, ref2+8*iEdgedWidth, ref3+8*iEdgedWidth, ref4+8*iEdgedWidth, iEdgedWidth, rounding);
-		interpolate8x8_avg4(Reference+8*iEdgedWidth+8, ref1+8*iEdgedWidth+8, ref2+8*iEdgedWidth+8, ref3+8*iEdgedWidth+8, ref4+8*iEdgedWidth+8, iEdgedWidth, rounding);
-		break;
-	}
-	
-	data->temp[0] = sad16v(data->Cur, Reference, data->iEdgedWidth, data->temp+1);
-
-	t = d_mv_bits(x - data->predQMV.x, y - data->predQMV.y, data->iFcode);
-	data->temp[0] += (data->lambda16 * t * data->temp[0])/1000;
-	data->temp[1] += (data->lambda8 * t * (data->temp[1] + NEIGH_8X8_BIAS))/100;
-
-	if (data->chroma)
-		data->temp[0] += ChromaSAD(x/2, y/2, data);
-
-	if (data->temp[0] < data->iMinSAD[0]) {
-		data->iMinSAD[0] = data->temp[0];
-		data->currentQMV[0].x = x; data->currentQMV[0].y = y;
-	/*	*dir = Direction;*/ }
-
-	if (data->temp[1] < data->iMinSAD[1]) {
-		data->iMinSAD[1] = data->temp[1]; data->currentQMV[1].x = x; data->currentQMV[1].y = y; }
-	if (data->temp[2] < data->iMinSAD[2]) {
-		data->iMinSAD[2] = data->temp[2]; data->currentQMV[2].x = x; data->currentQMV[2].y = y; }
-	if (data->temp[3] < data->iMinSAD[3]) {
-		data->iMinSAD[3] = data->temp[3]; data->currentQMV[3].x = x; data->currentQMV[3].y = y; }
-	if (data->temp[4] < data->iMinSAD[4]) {
-		data->iMinSAD[4] = data->temp[4]; data->currentQMV[4].x = x; data->currentQMV[4].y = y; }
-}
-
-static void 
 CheckCandidate16no4vI(const int x, const int y, const int Direction, int * const dir, const SearchData * const data)
 {
+// maximum speed - for P/B/I decision
 	int32_t sad;
 
 	if (( x > data->max_dx) || ( x < data->min_dx)
@@ -298,36 +335,39 @@
 CheckCandidateInt(const int xf, const int yf, const int Direction, int * const dir, const SearchData * const data)
 {
 	int32_t sad;
-	const int xb = data->currentMV[1].x;
-	const int yb = data->currentMV[1].y;
+	int xb, yb, t;
 	const uint8_t *ReferenceF, *ReferenceB;
+	VECTOR *current;
 
 	if (( xf > data->max_dx) || ( xf < data->min_dx)
 		|| ( yf > data->max_dy) || (yf < data->min_dy)) return;
 
-	switch ( ((xf&1)<<1) + (yf&1) ) {
-		case 0 : ReferenceF = data->Ref + xf/2 + (yf/2)*(data->iEdgedWidth); break;
-		case 1 : ReferenceF = data->RefV + xf/2 + ((yf-1)/2)*(data->iEdgedWidth); break;
-		case 2 : ReferenceF = data->RefH + (xf-1)/2 + (yf/2)*(data->iEdgedWidth); break;
-		default : ReferenceF = data->RefHV + (xf-1)/2 + ((yf-1)/2)*(data->iEdgedWidth); break;
-	}
-
-	switch ( ((xb&1)<<1) + (yb&1) ) {
-		case 0 : ReferenceB = data->bRef + xb/2 + (yb/2)*(data->iEdgedWidth); break;
-		case 1 : ReferenceB = data->bRefV + xb/2 + ((yb-1)/2)*(data->iEdgedWidth); break;
-		case 2 : ReferenceB = data->bRefH + (xb-1)/2 + (yb/2)*(data->iEdgedWidth); break;
-		default : ReferenceB = data->bRefHV + (xb-1)/2 + ((yb-1)/2)*(data->iEdgedWidth); break;
+	if (data->qpel_precision) {
+		ReferenceF = Interpolate16x16qpel(xf, yf, 0, data);
+		xb = data->currentQMV[1].x; yb = data->currentQMV[1].y;
+		current = data->currentQMV;
+		ReferenceB = Interpolate16x16qpel(xb, yb, 1, data);
+		t = d_mv_bits(xf - data->predMV.x, yf - data->predMV.y, data->iFcode)
+				 + d_mv_bits(xb - data->bpredMV.x, yb - data->bpredMV.y, data->iFcode);
+	} else {
+		ReferenceF = Interpolate16x16qpel(2*xf, 2*yf, 0, data);
+		xb = data->currentMV[1].x; yb = data->currentMV[1].y;
+		ReferenceB = Interpolate16x16qpel(2*xb, 2*yb, 1, data);
+		current = data->currentMV;
+		if (data->qpel)
+			t = d_mv_bits(2*xf - data->predMV.x, 2*yf - data->predMV.y, data->iFcode)
+					 + d_mv_bits(2*xb - data->bpredMV.x, 2*yb - data->bpredMV.y, data->iFcode);
+		else
+			t = d_mv_bits(xf - data->predMV.x, yf - data->predMV.y, data->iFcode)
+					 + d_mv_bits(xb - data->bpredMV.x, yb - data->bpredMV.y, data->iFcode);
 	}
 
 	sad = sad16bi(data->Cur, ReferenceF, ReferenceB, data->iEdgedWidth);
-
-	sad += (data->lambda16 *
-			( d_mv_bits(xf - data->predMV.x, yf - data->predMV.y, data->iFcode) + 
-			  d_mv_bits(xb - data->bpredMV.x, yb - data->bpredMV.y, data->iFcode)) * sad)/1000;
+	sad += (data->lambda16 * t * sad)/1000;
 
 	if (sad < *(data->iMinSAD)) {
 		*(data->iMinSAD) = sad;
-		data->currentMV->x = xf; data->currentMV->y = yf;
+		current->x = xf; current->y = yf;
 		*dir = Direction; }
 }
 
@@ -358,23 +398,15 @@
 			|| ( b_mvs.x > data->max_dx ) || ( b_mvs.x < data->min_dx )
 			|| ( b_mvs.y > data->max_dy ) || ( b_mvs.y < data->min_dy )) return;
 
-		switch ( ((mvs.x&1)<<1) + (mvs.y&1) ) {
-			case 0 : ReferenceF = data->Ref + mvs.x/2 + (mvs.y/2)*(data->iEdgedWidth); break;
-			case 1 : ReferenceF = data->RefV + mvs.x/2 + ((mvs.y-1)/2)*(data->iEdgedWidth); break;
-			case 2 : ReferenceF = data->RefH + (mvs.x-1)/2 + (mvs.y/2)*(data->iEdgedWidth); break;
-			default : ReferenceF = data->RefHV + (mvs.x-1)/2 + ((mvs.y-1)/2)*(data->iEdgedWidth); break;
-		}
-
-		switch ( ((b_mvs.x&1)<<1) + (b_mvs.y&1) ) {
-			case 0 : ReferenceB = data->bRef + b_mvs.x/2 + (b_mvs.y/2)*(data->iEdgedWidth); break;
-			case 1 : ReferenceB = data->bRefV + b_mvs.x/2 + ((b_mvs.y-1)/2)*(data->iEdgedWidth); break;
-			case 2 : ReferenceB = data->bRefH + (b_mvs.x-1)/2 + (b_mvs.y/2)*(data->iEdgedWidth); break;
-			default : ReferenceB = data->bRefHV + (b_mvs.x-1)/2 + ((b_mvs.y-1)/2)*(data->iEdgedWidth); break;
+		if (!data->qpel) { 
+			mvs.x *= 2; mvs.y *= 2; 
+			b_mvs.x *= 2; b_mvs.y *= 2; //we move to qpel precision anyway
 		}
+		ReferenceF = Interpolate8x8qpel(mvs.x, mvs.y, k, 0, data);
+		ReferenceB = Interpolate8x8qpel(b_mvs.x, b_mvs.y, k, 1, data);
 	
 		sad += sad8bi(data->Cur + 8*(k&1) + 8*(k>>1)*(data->iEdgedWidth),
-						ReferenceF + 8*(k&1) + 8*(k>>1)*(data->iEdgedWidth),
-						ReferenceB + 8*(k&1) + 8*(k>>1)*(data->iEdgedWidth),
+						ReferenceF, ReferenceB,
 						data->iEdgedWidth);
 		if (sad > *(data->iMinSAD)) return;
 	}
@@ -412,20 +444,13 @@
 		|| ( b_mvs.x > data->max_dx ) || ( b_mvs.x < data->min_dx )
 		|| ( b_mvs.y > data->max_dy ) || ( b_mvs.y < data->min_dy )) return;
 
-	switch ( ((mvs.x&1)<<1) + (mvs.y&1) ) {
-		case 0 : ReferenceF = data->Ref + mvs.x/2 + (mvs.y/2)*(data->iEdgedWidth); break;
-		case 1 : ReferenceF = data->RefV + mvs.x/2 + ((mvs.y-1)/2)*(data->iEdgedWidth); break;
-		case 2 : ReferenceF = data->RefH + (mvs.x-1)/2 + (mvs.y/2)*(data->iEdgedWidth); break;
-		default : ReferenceF = data->RefHV + (mvs.x-1)/2 + ((mvs.y-1)/2)*(data->iEdgedWidth); break;
-	}
+	if (!data->qpel) { 
+			mvs.x *= 2; mvs.y *= 2; 
+			b_mvs.x *= 2; b_mvs.y *= 2; //we move to qpel precision anyway
+		}
+	ReferenceF = Interpolate16x16qpel(mvs.x, mvs.y, 0, data);
+	ReferenceB = Interpolate16x16qpel(b_mvs.x, b_mvs.y, 1, data);
 
-	switch ( ((b_mvs.x&1)<<1) + (b_mvs.y&1) ) {
-		case 0 : ReferenceB = data->bRef + b_mvs.x/2 + (b_mvs.y/2)*(data->iEdgedWidth); break;
-		case 1 : ReferenceB = data->bRefV + b_mvs.x/2 + ((b_mvs.y-1)/2)*(data->iEdgedWidth); break;
-		case 2 : ReferenceB = data->bRefH + (b_mvs.x-1)/2 + (b_mvs.y/2)*(data->iEdgedWidth); break;
-		default : ReferenceB = data->bRefHV + (b_mvs.x-1)/2 + ((b_mvs.y-1)/2)*(data->iEdgedWidth); break;
-	}
-	
 	sad = sad16bi(data->Cur, ReferenceF, ReferenceB, data->iEdgedWidth);
 	sad += (data->lambda16 * d_mv_bits(x, y, 1) * sad)/1000;
 
@@ -444,16 +469,11 @@
 	if (( x > data->max_dx) || ( x < data->min_dx)
 		|| ( y > data->max_dy) || (y < data->min_dy)) return;
 
-	switch ( ((x&1)<<1) + (y&1) )
-	{
-		case 0 : Reference = data->Ref + x/2 + (y/2)*(data->iEdgedWidth); break;
-		case 1 : Reference = data->RefV + x/2 + ((y-1)/2)*(data->iEdgedWidth); break;
-		case 2 : Reference = data->RefH + (x-1)/2 + (y/2)*(data->iEdgedWidth); break;
-		default : Reference = data->RefHV + (x-1)/2 + ((y-1)/2)*(data->iEdgedWidth); break;
-	}
+	if (data->qpel) Reference = Interpolate16x16qpel(x, y, 0, data);
+	else Reference = Interpolate16x16qpel(2*x, 2*y, 0, data);
 
 	sad = sad8(data->Cur, Reference, data->iEdgedWidth);
-	if (data->qpel) t = d_mv_bits(2 * x - data->predQMV.x, 2 * y - data->predQMV.y, data->iFcode);
+	if (data->qpel) t = d_mv_bits(2 * x - data->predMV.x, 2 * y - data->predMV.y, data->iFcode);
 	else t = d_mv_bits(x - data->predMV.x, y - data->predMV.y, data->iFcode);
 
 	sad += (data->lambda8 * t * (sad+NEIGH_8X8_BIAS))/100;
@@ -464,62 +484,6 @@
 		*dir = Direction; }
 }
 
-static void
-CheckCandidate8_qpel(const int x, const int y, const int Direction, int * const dir, const SearchData * const data)
-// CheckCandidate16no4v variant which expects x and y in quarter pixel resolution
-// Important: This is no general usable routine! x and y must be +/-1 (qpel resolution!)
-// around currentMV!
-
-{
-	int32_t sad;
-	uint8_t *Reference = (uint8_t *) data->RefQ;
-	const uint8_t *ref1, *ref2, *ref3, *ref4;
-	VECTOR halfpelMV = *(data->currentMV);
-
-	int32_t iEdgedWidth = data->iEdgedWidth;
-	uint32_t rounding = data->rounding;
-
-	if (( x > data->max_dx) || ( x < data->min_dx)
-		|| ( y > data->max_dy) || (y < data->min_dy)) return;
-
-	GET_REFERENCE(halfpelMV.x, halfpelMV.y, ref1);
-	switch( ((x&1)<<1) + (y&1) )
-	{
-	case 0: // pure halfpel position - shouldn't happen during a refinement step
-		GET_REFERENCE(halfpelMV.x, halfpelMV.y, Reference); 
-		break;
-
-	case 1: // x halfpel, y qpel - top or bottom during qpel refinement
-		GET_REFERENCE(halfpelMV.x, y - halfpelMV.y, ref2);
-
-		interpolate8x8_avg2(Reference, ref1, ref2, iEdgedWidth, rounding);
-		break;
-
-	case 2: // x qpel, y halfpel - left or right during qpel refinement
-		GET_REFERENCE(x - halfpelMV.x, halfpelMV.y, ref2);
-
-		interpolate8x8_avg2(Reference, ref1, ref2, iEdgedWidth, rounding);
-		break;
-
-	default: // x and y in qpel resolution - the "corners" (top left/right and
-			 // bottom left/right) during qpel refinement
-		GET_REFERENCE(halfpelMV.x, y - halfpelMV.y, ref2);
-		GET_REFERENCE(x - halfpelMV.x, halfpelMV.y, ref3);
-		GET_REFERENCE(x - halfpelMV.x, y - halfpelMV.y, ref4);
-
-		interpolate8x8_avg4(Reference, ref1, ref2, ref3, ref4, iEdgedWidth, rounding);
-		break;
-	}
-
-	sad = sad8(data->Cur, Reference, data->iEdgedWidth);
-	sad += (data->lambda8 * d_mv_bits(x - data->predQMV.x, y - data->predQMV.y, data->iFcode) * (sad+NEIGH_8X8_BIAS))/100;
-
-	if (sad < *(data->iMinSAD)) {
-		*(data->iMinSAD) = sad;
-		data->currentQMV->x = x; data->currentQMV->y = y;
-		*dir = Direction; }
-}
-
 /* CHECK_CANDIATE FUNCTIONS END */
 
 /* MAINSEARCH FUNCTIONS START */
@@ -669,33 +633,15 @@
 /* HALFPELREFINE COULD BE A MAINSEARCH FUNCTION, BUT THERE IS NO NEED FOR IT */
 
 static void
-HalfpelRefine(const SearchData * const data)
+SubpelRefine(const SearchData * const data)
 {
-/* Do a half-pel refinement (or rather a "smallest possible amount" refinement) */
-
-	VECTOR backupMV = *(data->currentMV);
+/* Do a half-pel or q-pel refinement */
+	VECTOR backupMV;
 	int iDirection; //not needed
 
-	CHECK_CANDIDATE(backupMV.x - 1, backupMV.y - 1, 0);
-	CHECK_CANDIDATE(backupMV.x + 1, backupMV.y - 1, 0);
-	CHECK_CANDIDATE(backupMV.x - 1, backupMV.y + 1, 0);
-	CHECK_CANDIDATE(backupMV.x + 1, backupMV.y + 1, 0);
-
-	CHECK_CANDIDATE(backupMV.x - 1, backupMV.y, 0);
-	CHECK_CANDIDATE(backupMV.x + 1, backupMV.y, 0);
-
-	CHECK_CANDIDATE(backupMV.x, backupMV.y + 1, 0);
-	CHECK_CANDIDATE(backupMV.x, backupMV.y - 1, 0);
-}
-
-
-static void
-QuarterpelRefine(const SearchData * const data)
-{
-/* Perform quarter pixel refinement*/
-
-	VECTOR backupMV = *(data->currentQMV);
-	int iDirection; //not needed
+	if (data->qpel_precision)
+		backupMV = *(data->currentQMV);
+	else backupMV = *(data->currentMV);
 
 	CHECK_CANDIDATE(backupMV.x - 1, backupMV.y - 1, 0);
 	CHECK_CANDIDATE(backupMV.x + 1, backupMV.y - 1, 0);
@@ -707,7 +653,6 @@
 
 	CHECK_CANDIDATE(backupMV.x, backupMV.y + 1, 0);
 	CHECK_CANDIDATE(backupMV.x, backupMV.y - 1, 0);
-
 }
 
 static __inline int
@@ -959,8 +904,6 @@
 	get_range(&Data->min_dx, &Data->max_dx, &Data->min_dy, &Data->max_dy, x, y, 16,
 				pParam->width, pParam->height, Data->iFcode, pParam->m_quarterpel);
 
-	Data->predMV = pmv[0];
-
 	Data->Cur = pCur->y + (x + y * Data->iEdgedWidth) * 16;
 	Data->CurV = pCur->v + (x + y * (Data->iEdgedWidth/2)) * 8;
 	Data->CurU = pCur->u + (x + y * (Data->iEdgedWidth/2)) * 8;
@@ -974,6 +917,7 @@
 
 	Data->lambda16 = lambda_vec16[iQuant];
 	Data->lambda8 = lambda_vec8[iQuant];
+	Data->qpel_precision = 0;
 
 	if (!(MotionFlags & PMV_HALFPEL16)) {
 		Data->min_dx = EVEN(Data->min_dx);
@@ -986,11 +930,10 @@
 	for(i = 0;  i < 5; i++)
 		Data->currentMV[i].x = Data->currentMV[i].y = 0;
 
-	if (pParam->m_quarterpel) {
-		Data->predQMV = get_qpmv2(pMBs, pParam->mb_width, 0, x, y, 0);
-		i = d_mv_bits(Data->predQMV.x, Data->predQMV.y, Data->iFcode);
-	} else i = d_mv_bits(Data->predMV.x, Data->predMV.y, Data->iFcode);
+	if (pParam->m_quarterpel) Data->predMV = get_qpmv2(pMBs, pParam->mb_width, 0, x, y, 0);
+	else Data->predMV = pmv[0];
 
+	i = d_mv_bits(Data->predMV.x, Data->predMV.y, Data->iFcode);
 	Data->iMinSAD[0] = pMB->sad16 + (Data->lambda16 * i * pMB->sad16)/1000;
 	Data->iMinSAD[1] = pMB->sad8[0] + (Data->lambda8 * i * (pMB->sad8[0]+NEIGH_8X8_BIAS))/100;
 	Data->iMinSAD[2] = pMB->sad8[1];
@@ -1006,8 +949,8 @@
 	PreparePredictionsP(pmv, x, y, pParam->mb_width, pParam->mb_height,
 					prevMBs + x + y * pParam->mb_width);
 
-	if (inter4v || pParam->m_quarterpel || Data->chroma) CheckCandidate = CheckCandidate16;
-	else CheckCandidate = CheckCandidate16no4v;
+	if (inter4v || Data->chroma) CheckCandidate = CheckCandidate16;
+	else CheckCandidate = CheckCandidate16no4v; //for extra speed
 
 /* main loop. checking all predictions */
 
@@ -1064,7 +1007,7 @@
 		}
 	}
 
-	if (MotionFlags & PMV_HALFPELREFINE16) HalfpelRefine(Data);
+	if (MotionFlags & PMV_HALFPELREFINE16) SubpelRefine(Data);
 
 	for(i = 0; i < 5; i++) {
 		Data->currentQMV[i].x = 2 * Data->currentMV[i].x; // initialize qpel vectors
@@ -1073,11 +1016,11 @@
 
 	if((pParam->m_quarterpel) && (MotionFlags & PMV_QUARTERPELREFINE16)) {
 
-		CheckCandidate = CheckCandidate16_qpel;
+		Data->qpel_precision = 1;
 		get_range(&Data->min_dx, &Data->max_dx, &Data->min_dy, &Data->max_dy, x, y, 16,
 				pParam->width, pParam->height, Data->iFcode, 0);
 
-		QuarterpelRefine(Data);
+		SubpelRefine(Data);
 	}
 
 	if (Data->iMinSAD[0] < (int32_t)iQuant * 30 ) inter4v = 0;
@@ -1094,17 +1037,17 @@
 		Search8(Data, 2*x + 1, 2*y + 1, MotionFlags, pParam, pMB, pMBs, 3, &Data8);
 		
 		if (Data->chroma) {
-			int sum, dx, dy;
-
-			if(pParam->m_quarterpel) {
-				sum = pMB->qmvs[0].y/2 + pMB->qmvs[1].y/2 + pMB->qmvs[2].y/2 + pMB->qmvs[3].y/2;
-			} else sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
-			dy = (sum >> 3) + roundtab_76[sum & 0xf];
+			int sumx, sumy, dx, dy;
 
 			if(pParam->m_quarterpel) {
-				sum = pMB->qmvs[0].x/2 + pMB->qmvs[1].x/2 + pMB->qmvs[2].x/2 + pMB->qmvs[3].x/2;
-			} else sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
-			dx = (sum >> 3) + roundtab_76[sum & 0xf];
+				sumx= pMB->qmvs[0].x/2 + pMB->qmvs[1].x/2 + pMB->qmvs[2].x/2 + pMB->qmvs[3].x/2;
+				sumy = pMB->qmvs[0].y/2 + pMB->qmvs[1].y/2 + pMB->qmvs[2].y/2 + pMB->qmvs[3].y/2;
+			} else {
+				sumx = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
+				sumy = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
+			}
+			dx = (sumx >> 3) + roundtab_76[sumx & 0xf];
+			dy = (sumy >> 3) + roundtab_76[sumy & 0xf];
 			
 			Data->iMinSAD[1] += ChromaSAD(dx, dy, Data);
 		}
@@ -1118,15 +1061,14 @@
 		pMB->mvs[0] = pMB->mvs[1]
 			= pMB->mvs[2] = pMB->mvs[3] = Data->currentMV[0];
 
-		pMB->qmvs[0] = pMB->qmvs[1]
-			= pMB->qmvs[2] = pMB->qmvs[3] = Data->currentQMV[0];
-
 		pMB->sad16 = pMB->sad8[0] = pMB->sad8[1] =
 			pMB->sad8[2] = pMB->sad8[3] =  Data->iMinSAD[0];
 
 		if(pParam->m_quarterpel) {
-			pMB->pmvs[0].x = Data->currentQMV[0].x - Data->predQMV.x;
-			pMB->pmvs[0].y = Data->currentQMV[0].y - Data->predQMV.y;
+			pMB->qmvs[0] = pMB->qmvs[1]
+				= pMB->qmvs[2] = pMB->qmvs[3] = Data->currentQMV[0];
+			pMB->pmvs[0].x = Data->currentQMV[0].x - Data->predMV.x;
+			pMB->pmvs[0].y = Data->currentQMV[0].y - Data->predMV.y;
 		} else {
 			pMB->pmvs[0].x = Data->currentMV[0].x - Data->predMV.x;
 			pMB->pmvs[0].y = Data->currentMV[0].y - Data->predMV.y;
@@ -1154,10 +1096,10 @@
 	Data->currentQMV = OldData->currentQMV + 1 + block;
 
 	if(pParam->m_quarterpel) {
-		Data->predQMV = get_qpmv2(pMBs, pParam->mb_width, 0, x/2 , y/2, block);
+		Data->predMV = get_qpmv2(pMBs, pParam->mb_width, 0, x/2 , y/2, block);
 		if (block != 0)	*(Data->iMinSAD) += (Data->lambda8 *
-									d_mv_bits(	Data->currentQMV->x - Data->predQMV.x, 
-												Data->currentQMV->y - Data->predQMV.y,
+									d_mv_bits(	Data->currentQMV->x - Data->predMV.x, 
+												Data->currentQMV->y - Data->predMV.y,
 												Data->iFcode) * (*Data->iMinSAD + NEIGH_8X8_BIAS))/100;
 	} else {
 		Data->predMV = get_pmv2(pMBs, pParam->mb_width, 0, x/2 , y/2, block);
@@ -1175,6 +1117,7 @@
 		Data->RefHV = OldData->RefHV + 8 * ((block&1) + pParam->edged_width*(block>>1));
 
 		Data->Cur = OldData->Cur + 8 * ((block&1) + pParam->edged_width*(block>>1));
+		Data->qpel_precision = 0;
 		
 		get_range(&Data->min_dx, &Data->max_dx, &Data->min_dy, &Data->max_dy, x, y, 8,
 				pParam->width, pParam->height, OldData->iFcode, pParam->m_quarterpel);
@@ -1199,7 +1142,7 @@
 		if (MotionFlags & PMV_HALFPELREFINE8) {
 			int32_t temp_sad = *(Data->iMinSAD); // store current MinSAD
 
-			HalfpelRefine(Data); // perform halfpel refine of current best vector
+			SubpelRefine(Data); // perform halfpel refine of current best vector
 
 			if(*(Data->iMinSAD) < temp_sad) { // we have found a better match
 				Data->currentQMV->x = 2 * Data->currentMV->x; // update our qpel vector
@@ -1210,17 +1153,18 @@
 		if(pParam->m_quarterpel) {
 			if((!(Data->currentQMV->x & 1)) && (!(Data->currentQMV->y & 1)) &&
 				(MotionFlags & PMV_QUARTERPELREFINE8)) {
-			CheckCandidate = CheckCandidate8_qpel;
+			Data->qpel_precision = 1;
 			get_range(&Data->min_dx, &Data->max_dx, &Data->min_dy, &Data->max_dy, x, y, 8,
-				pParam->width, pParam->height, OldData->iFcode, pParam->m_quarterpel);
-			QuarterpelRefine(Data);
+				pParam->width, pParam->height, OldData->iFcode, 0);
+			SubpelRefine(Data);
 			}
 		}
 	}
 
 	if(pParam->m_quarterpel) {
-		pMB->pmvs[block].x = Data->currentQMV->x - Data->predQMV.x;
-		pMB->pmvs[block].y = Data->currentQMV->y - Data->predQMV.y;
+		pMB->pmvs[block].x = Data->currentQMV->x - Data->predMV.x;
+		pMB->pmvs[block].y = Data->currentQMV->y - Data->predMV.y;
+		pMB->qmvs[block] = *(Data->currentQMV);
 	}
 	else {
 		pMB->pmvs[block].x = Data->currentMV->x - Data->predMV.x;
@@ -1228,8 +1172,6 @@
 	}
 
 	pMB->mvs[block] = *(Data->currentMV);
-	pMB->qmvs[block] = *(Data->currentQMV);
-
 	pMB->sad8[block] =  4 * (*Data->iMinSAD);
 }
 
@@ -1307,6 +1249,7 @@
 	MainSearchFunc *MainSearchPtr;
 	*Data->iMinSAD = MV_MAX_ERROR;
 	Data->iFcode = iFcode;
+	Data->qpel_precision = 0;
 
 	Data->Ref = pRef + (x + y * iEdgedWidth) * 16;
 	Data->RefH = pRefH + (x + y * iEdgedWidth) * 16;
@@ -1319,10 +1262,10 @@
 				pParam->width, pParam->height, iFcode, pParam->m_quarterpel);
 
 	pmv[0] = Data->predMV;
+	if (Data->qpel) { pmv[0].x /= 2; pmv[0].y /= 2; }
 	PreparePredictionsBF(pmv, x, y, pParam->mb_width, pMB, mode_current);
 
 	Data->currentMV->x = Data->currentMV->y = 0;
-
 	CheckCandidate = CheckCandidate16no4v;
 
 // main loop. checking all predictions
@@ -1339,7 +1282,16 @@
 
 	(*MainSearchPtr)(Data->currentMV->x, Data->currentMV->y, Data, 255);
 
-	HalfpelRefine(Data);
+	SubpelRefine(Data);
+	
+	if (Data->qpel) {
+		Data->currentQMV->x = 2*Data->currentMV->x;
+		Data->currentQMV->y = 2*Data->currentMV->y;
+		Data->qpel_precision = 1;
+		get_range(&Data->min_dx, &Data->max_dx, &Data->min_dy, &Data->max_dy, x, y, 16,
+					pParam->width, pParam->height, iFcode, 0);
+		SubpelRefine(Data);
+	}
 
 // three bits are needed to code backward mode. four for forward
 // we treat the bits just like they were vector's
@@ -1349,10 +1301,22 @@
 	if (*Data->iMinSAD < *best_sad) {
 		*best_sad = *Data->iMinSAD;
 		pMB->mode = mode_current;
-		pMB->pmvs[0].x = Data->currentMV->x - predMV->x;
-		pMB->pmvs[0].y = Data->currentMV->y - predMV->y;
-		if (mode_current == MODE_FORWARD) pMB->mvs[0] = *(Data->currentMV+2) = *Data->currentMV;
-		else pMB->b_mvs[0] = *(Data->currentMV+1) = *Data->currentMV; //we store currmv for interpolate search
+		if (Data->qpel) {
+			pMB->pmvs[0].x = Data->currentQMV->x - predMV->x;
+			pMB->pmvs[0].y = Data->currentQMV->y - predMV->y;
+			if (mode_current == MODE_FORWARD) 
+				pMB->qmvs[0] = *Data->currentQMV;
+			else 
+				pMB->b_qmvs[0] = *Data->currentQMV;
+		} else {
+			pMB->pmvs[0].x = Data->currentMV->x - predMV->x;
+			pMB->pmvs[0].y = Data->currentMV->y - predMV->y;
+		}
+		if (mode_current == MODE_FORWARD) 
+			pMB->mvs[0] = *(Data->currentMV+2) = *Data->currentMV;
+		else 
+			pMB->b_mvs[0] = *(Data->currentMV+1) = *Data->currentMV; //we store currmv for interpolate search
+
 	}
 	
 }
@@ -1383,7 +1347,6 @@
 	MainSearchFunc *MainSearchPtr;
 
 	*Data->iMinSAD = 256*4096;
-	Data->referencemv = b_mb->mvs;
 
 	Data->Ref = f_Ref->y + (x + Data->iEdgedWidth*y) * 16;
 	Data->RefH = f_RefH + (x + Data->iEdgedWidth*y) * 16;
@@ -1398,6 +1361,14 @@
 	Data->max_dy = 2 * pParam->height - 2 * (y) * 16;
 	Data->min_dx = -(2 * 16 + 2 * (x) * 16);
 	Data->min_dy = -(2 * 16 + 2 * (y) * 16);
+	if (Data->qpel) { //we measure in qpixels
+		Data->max_dx *= 2;
+		Data->max_dy *= 2;
+		Data->min_dx *= 2;
+		Data->min_dy *= 2;
+		Data->referencemv = b_mb->qmvs;
+	} else Data->referencemv = b_mb->mvs;
+	Data->qpel_precision = 0; // deliberate trick: in qpel mode the real precision is 1, but we need 0 here
 
 	for (k = 0; k < 4; k++) {
 		pMB->mvs[k].x = Data->directmvF[k].x = ((TRB * Data->referencemv[k].x) / TRD);
@@ -1422,10 +1393,10 @@
 		}
 	}
 
-	if (b_mb->mode == MODE_INTER4V) 
-		CheckCandidate = CheckCandidateDirect;
+	
+	if (b_mb->mode == MODE_INTER4V) CheckCandidate = CheckCandidateDirect;
 	else CheckCandidate = CheckCandidateDirectno4v;
-
+		
 	(*CheckCandidate)(0, 0, 255, &k, Data);
 
 // skip decision
@@ -1434,18 +1405,28 @@
 		//this is not full chroma compensation, only it's fullpel approximation. should work though
 		int sum, dx, dy, b_dx, b_dy;
 
-		sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
-		dx = (sum == 0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2));
-
-		sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
-		dy = (sum == 0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2));
-
-		sum = pMB->b_mvs[0].x + pMB->b_mvs[1].x + pMB->b_mvs[2].x + pMB->b_mvs[3].x;
-		b_dx = (sum == 0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2));
+		if (Data->qpel) {
+			sum = pMB->mvs[0].y/2 + pMB->mvs[1].y/2 + pMB->mvs[2].y/2 + pMB->mvs[3].y/2;
+			dy = (sum >> 3) + roundtab_76[sum & 0xf];
+			sum = pMB->mvs[0].x/2 + pMB->mvs[1].x/2 + pMB->mvs[2].x/2 + pMB->mvs[3].x/2;
+			dx = (sum >> 3) + roundtab_76[sum & 0xf];
 
-		sum = pMB->b_mvs[0].y + pMB->b_mvs[1].y + pMB->b_mvs[2].y + pMB->b_mvs[3].y;
-		b_dy = (sum == 0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2));
+			sum = pMB->b_mvs[0].y/2 + pMB->b_mvs[1].y/2 + pMB->b_mvs[2].y/2 + pMB->b_mvs[3].y/2;
+			b_dy = (sum >> 3) + roundtab_76[sum & 0xf];
+			sum = pMB->b_mvs[0].x/2 + pMB->b_mvs[1].x/2 + pMB->b_mvs[2].x/2 + pMB->b_mvs[3].x/2;
+			b_dx = (sum >> 3) + roundtab_76[sum & 0xf];
 
+		} else {
+			sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
+			dx = (sum == 0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2));
+			sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
+			dy = (sum == 0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2));
+
+			sum = pMB->b_mvs[0].x + pMB->b_mvs[1].x + pMB->b_mvs[2].x + pMB->b_mvs[3].x;
+			b_dx = (sum == 0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2));
+			sum = pMB->b_mvs[0].y + pMB->b_mvs[1].y + pMB->b_mvs[2].y + pMB->b_mvs[3].y;
+			b_dy = (sum == 0 ? 0 : SIGN(sum) * (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2));
+		}
 		sum = sad8bi(pCur->u + 8*x + 8*y*(Data->iEdgedWidth/2),
 					f_Ref->u + (y*8 + dy/2) * (Data->iEdgedWidth/2) + x*8 + dx/2,
 					b_Ref->u + (y*8 + b_dy/2) * (Data->iEdgedWidth/2) + x*8 + b_dx/2,
@@ -1472,7 +1453,7 @@
 
 	(*MainSearchPtr)(0, 0, Data, 255);
 
-	HalfpelRefine(Data);
+	SubpelRefine(Data);
 
 	*Data->iMinSAD +=  1 * Data->lambda16; // one bit is needed to code direct mode
 	*best_sad = *Data->iMinSAD;
@@ -1485,16 +1466,25 @@
 
 	for (k = 0; k < 4; k++) {
 		pMB->mvs[k].x = Data->directmvF[k].x + Data->currentMV->x;
-		pMB->b_mvs[k].x = ((Data->currentMV->x == 0)
+		pMB->b_mvs[k].x = (	(Data->currentMV->x == 0)
 							? Data->directmvB[k].x
-							: pMB->mvs[k].x - Data->referencemv[k].x);
+							:pMB->mvs[k].x - Data->referencemv[k].x);
 		pMB->mvs[k].y = (Data->directmvF[k].y + Data->currentMV->y);
 		pMB->b_mvs[k].y = ((Data->currentMV->y == 0)
 							? Data->directmvB[k].y
 							: pMB->mvs[k].y - Data->referencemv[k].y);
+		if (Data->qpel) {
+			pMB->qmvs[k].x = pMB->mvs[k].x; pMB->mvs[k].x /= 2;
+			pMB->b_qmvs[k].x = pMB->b_mvs[k].x; pMB->b_mvs[k].x /= 2;
+			pMB->qmvs[k].y = pMB->mvs[k].y; pMB->mvs[k].y /= 2;
+			pMB->b_qmvs[k].y = pMB->b_mvs[k].y; pMB->b_mvs[k].y /= 2;
+		}
+
 		if (b_mb->mode != MODE_INTER4V) {
 			pMB->mvs[3] = pMB->mvs[2] = pMB->mvs[1] = pMB->mvs[0];
 			pMB->b_mvs[3] = pMB->b_mvs[2] = pMB->b_mvs[1] = pMB->b_mvs[0];
+			pMB->qmvs[3] = pMB->qmvs[2] = pMB->qmvs[1] = pMB->qmvs[0];
+			pMB->b_qmvs[3] = pMB->b_qmvs[2] = pMB->b_qmvs[1] = pMB->b_qmvs[0];
 			break;
 		}
 	}
@@ -1526,14 +1516,13 @@
 {
 
 	const int32_t iEdgedWidth = pParam->edged_width;
-
 	int iDirection, i, j;
 	SearchData bData;
 
 	*(bData.iMinSAD = fData->iMinSAD) = 4096*256;
 	bData.Cur = fData->Cur;
 	fData->iEdgedWidth = bData.iEdgedWidth = iEdgedWidth;
-	bData.currentMV = fData->currentMV + 1;
+	bData.currentMV = fData->currentMV + 1; bData.currentQMV = fData->currentQMV + 1;
 	bData.lambda16 = fData->lambda16;
 	fData->iFcode = bData.bFcode = fcode; fData->bFcode = bData.iFcode = bcode;
 
@@ -1545,11 +1534,14 @@
 	bData.RefH = fData->bRefH = b_RefH + (x + y * iEdgedWidth) * 16;
 	bData.RefV = fData->bRefV = b_RefV + (x + y * iEdgedWidth) * 16;
 	bData.RefHV = fData->bRefHV = b_RefHV + (x + y * iEdgedWidth) * 16;
+	bData.RefQ = fData->RefQ;
+	fData->qpel_precision = bData.qpel_precision = 0;
+	bData.rounding = 0;
 
 	bData.bpredMV = fData->predMV = *f_predMV;
 	fData->bpredMV = bData.predMV = *b_predMV;
 
-	fData->currentMV[0] = fData->currentMV[3]; //forward search stored it's vector here. backward stored it in the place it's needed
+	fData->currentMV[0] = fData->currentMV[2];
 	get_range(&fData->min_dx, &fData->max_dx, &fData->min_dy, &fData->max_dy, x, y, 16, pParam->width, pParam->height, fcode, pParam->m_quarterpel);
 	get_range(&bData.min_dx, &bData.max_dx, &bData.min_dy, &bData.max_dy, x, y, 16, pParam->width, pParam->height, bcode, pParam->m_quarterpel);
 
@@ -1580,7 +1572,6 @@
 		// backward MV moves
 		i = fData->currentMV[1].x; j = fData->currentMV[1].y;
 		fData->currentMV[2] = fData->currentMV[0];
-
 		CheckCandidateInt(i + 1, j, 0, &iDirection, &bData);
 		CheckCandidateInt(i, j + 1, 0, &iDirection, &bData);
 		CheckCandidateInt(i - 1, j, 0, &iDirection, &bData);
@@ -1590,16 +1581,37 @@
 
 	*fData->iMinSAD +=  2 * fData->lambda16; // two bits are needed to code interpolate mode.
 
+	if (fData->qpel) {
+		fData->qpel_precision = bData.qpel_precision = 1;
+		get_range(&fData->min_dx, &fData->max_dx, &fData->min_dy, &fData->max_dy, x, y, 16, pParam->width, pParam->height, fcode, 0);
+		get_range(&bData.min_dx, &bData.max_dx, &bData.min_dy, &bData.max_dy, x, y, 16, pParam->width, pParam->height, bcode, 0);
+		fData->currentQMV[2].x = fData->currentQMV[0].x = 2 * fData->currentMV[0].x;
+		fData->currentQMV[2].y = fData->currentQMV[0].y = 2 * fData->currentMV[0].y;
+		fData->currentQMV[1].x = 2 * fData->currentMV[1].x;
+		fData->currentQMV[1].y = 2 * fData->currentMV[1].y;
+		SubpelRefine(fData);
+		fData->currentQMV[2] = fData->currentQMV[0];
+		SubpelRefine(&bData);
+	}
+
 	if (*fData->iMinSAD < *best_sad) {
 		*best_sad = *fData->iMinSAD;
 		pMB->mvs[0] = fData->currentMV[0];
 		pMB->b_mvs[0] = fData->currentMV[1];
 		pMB->mode = MODE_INTERPOLATE;
-
-		pMB->pmvs[1].x = pMB->mvs[0].x - f_predMV->x;
-		pMB->pmvs[1].y = pMB->mvs[0].y - f_predMV->y;
-		pMB->pmvs[0].x = pMB->b_mvs[0].x - b_predMV->x;
-		pMB->pmvs[0].y = pMB->b_mvs[0].y - b_predMV->y;
+		if (fData->qpel) {
+			pMB->qmvs[0] = fData->currentQMV[0];
+			pMB->b_qmvs[0] = fData->currentQMV[1];
+			pMB->pmvs[1].x = pMB->qmvs[0].x - f_predMV->x;
+			pMB->pmvs[1].y = pMB->qmvs[0].y - f_predMV->y;
+			pMB->pmvs[0].x = pMB->b_qmvs[0].x - b_predMV->x;
+			pMB->pmvs[0].y = pMB->b_qmvs[0].y - b_predMV->y;
+		} else {
+			pMB->pmvs[1].x = pMB->mvs[0].x - f_predMV->x;
+			pMB->pmvs[1].y = pMB->mvs[0].y - f_predMV->y;
+			pMB->pmvs[0].x = pMB->b_mvs[0].x - b_predMV->x;
+			pMB->pmvs[0].y = pMB->b_mvs[0].y - b_predMV->y;
+		}
 	}
 }
 
@@ -1631,19 +1643,28 @@
 
 	const int32_t TRB = time_pp - time_bp;
 	const int32_t TRD = time_pp;
+	uint8_t * qimage;
 
 // some pre-inintialized data for the rest of the search
 
 	SearchData Data;
 	int32_t iMinSAD;
 	VECTOR currentMV[3];
+	VECTOR currentQMV[3];
 	Data.iEdgedWidth = pParam->edged_width;
-	Data.currentMV = currentMV;
+	Data.currentMV = currentMV; Data.currentQMV = currentQMV;
 	Data.iMinSAD = &iMinSAD;
 	Data.lambda16 = lambda_vec16[frame->quant];
+	Data.qpel = pParam->m_quarterpel;
+	Data.rounding = 0;
 
-	// note: i==horizontal, j==vertical
+	if((qimage = (uint8_t *) malloc(32 * pParam->edged_width)) == NULL)
+		return; // allocate some mem for qpel interpolated blocks
+				  // FIXME: this is dirty -- malloc probably should not be used outside
+				  // encoder_create(); please move this allocation there.
+	Data.RefQ = qimage;
 
+	// note: i==horizontal, j==vertical
 	for (j = 0; j < pParam->mb_height; j++) {
 
 		f_predMV = b_predMV = zeroMV;	/* prediction is reset at left boundary */
@@ -1676,7 +1697,7 @@
 									&Data);
 
 			if (pMB->mode == MODE_DIRECT_NONE_MV) { n_count++; continue; }
-
+			
 			// forward search
 			SearchBF(f_ref->y, f_refH->y, f_refV->y, f_refHV->y,
 						&frame->image, i, j,
@@ -1709,16 +1730,23 @@
 			switch (pMB->mode) {
 				case MODE_FORWARD:
 					f_count++;
-					f_predMV = pMB->mvs[0];
+					if (pParam->m_quarterpel) f_predMV = pMB->qmvs[0];
+					else f_predMV = pMB->mvs[0];
 					break;
 				case MODE_BACKWARD:
 					b_count++;
-					b_predMV = pMB->b_mvs[0];
+					if (pParam->m_quarterpel) b_predMV = pMB->b_qmvs[0];
+					else b_predMV = pMB->b_mvs[0];
 					break;
 				case MODE_INTERPOLATE:
 					i_count++;
-					f_predMV = pMB->mvs[0];
-					b_predMV = pMB->b_mvs[0];
+					if (pParam->m_quarterpel) {
+						f_predMV = pMB->qmvs[0];
+						b_predMV = pMB->b_qmvs[0];
+					} else { 
+						f_predMV = pMB->mvs[0];
+						b_predMV = pMB->b_mvs[0];
+					}
 					break;
 				case MODE_DIRECT:
 				case MODE_DIRECT_NO4V:
@@ -1729,6 +1757,7 @@
 			}
 		}
 	}
+	free(qimage);
 }
 
 /* Hinted ME starts here */
@@ -1753,8 +1782,6 @@
 	int i, t;
 	MainSearchFunc * MainSearchPtr;
 
-	Data->predMV = get_pmv2(pMBs, pParam->mb_width, 0, x, y, 0);
-	Data->predQMV = get_qpmv2(pMBs, pParam->mb_width, 0, x, y, 0);
 	get_range(&Data->min_dx, &Data->max_dx, &Data->min_dy, &Data->max_dy, x, y, 16,
 				pParam->width, pParam->height, Data->iFcode, pParam->m_quarterpel);
 
@@ -1768,6 +1795,7 @@
 	Data->RefHV = pRefHV + (x + Data->iEdgedWidth*y) * 16;
 	Data->RefCV = pRef->v + (x + y * (Data->iEdgedWidth/2)) * 8;
 	Data->RefCU = pRef->u + (x + y * (Data->iEdgedWidth/2)) * 8;
+	Data->qpel_precision = 0;
 
 	if (!(MotionFlags & PMV_HALFPEL16)) {
 		Data->min_dx = EVEN(Data->min_dx);
@@ -1775,12 +1803,14 @@
 		Data->min_dy = EVEN(Data->min_dy);
 		Data->max_dy = EVEN(Data->max_dy); 
 	}
+	if (pParam->m_quarterpel) Data->predMV = get_qpmv2(pMBs, pParam->mb_width, 0, x, y, 0);
+	else Data->predMV = get_pmv2(pMBs, pParam->mb_width, 0, x, y, 0);
 
 	for(i = 0; i < 5; i++) Data->iMinSAD[i] = MV_MAX_ERROR;
 
 	if (pMB->dquant != NO_CHANGE) inter4v = 0;
 
-	if (inter4v || pParam->m_quarterpel || Data->chroma) CheckCandidate = CheckCandidate16;
+	if (inter4v || Data->chroma) CheckCandidate = CheckCandidate16;
 	else CheckCandidate = CheckCandidate16no4v;
 
 	pMB->mvs[0].x = EVEN(pMB->mvs[0].x);
@@ -1808,7 +1838,7 @@
 
 	(*MainSearchPtr)(Data->currentMV->x, Data->currentMV->y, Data, 255);
 
-	if (MotionFlags & PMV_HALFPELREFINE16) HalfpelRefine(Data);
+	if (MotionFlags & PMV_HALFPELREFINE16) SubpelRefine(Data);
 
 	for(i = 0; i < 5; i++) {
 		Data->currentQMV[i].x = 2 * Data->currentMV[i].x; // initialize qpel vectors
@@ -1818,8 +1848,8 @@
 	if((pParam->m_quarterpel) && (MotionFlags & PMV_QUARTERPELREFINE16)) {
 		get_range(&Data->min_dx, &Data->max_dx, &Data->min_dy, &Data->max_dy, x, y, 16,
 				pParam->width, pParam->height, Data->iFcode, 0);
-		CheckCandidate = CheckCandidate16_qpel;
-		QuarterpelRefine(Data);
+		Data->qpel_precision = 1;
+		SubpelRefine(Data);
 	}
 
 	if (inter4v) {
@@ -1835,19 +1865,18 @@
 		Search8(Data, 2*x + 1, 2*y + 1, MotionFlags, pParam, pMB, pMBs, 3, &Data8);
 
 		if (Data->chroma) {
-			int sum, dx, dy;
+			int sumx, sumy, dx, dy;
 
-			if(pParam->m_quarterpel)
-				sum = (pMB->qmvs[0].y + pMB->qmvs[1].y + pMB->qmvs[2].y + pMB->qmvs[3].y)/2;
-			else sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
-			dy = (sum ? SIGN(sum) *
-				  (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2) : 0);
-
-			if(pParam->m_quarterpel)
-				sum = (pMB->qmvs[0].x + pMB->qmvs[1].x + pMB->qmvs[2].x + pMB->qmvs[3].x)/2;
-			else sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
-			dx = (sum ? SIGN(sum) *
-				  (roundtab[ABS(sum) % 16] + (ABS(sum) / 16) * 2) : 0);
+			if(pParam->m_quarterpel) {
+				sumx= pMB->qmvs[0].x/2 + pMB->qmvs[1].x/2 + pMB->qmvs[2].x/2 + pMB->qmvs[3].x/2;
+				sumy = pMB->qmvs[0].y/2 + pMB->qmvs[1].y/2 + pMB->qmvs[2].y/2 + pMB->qmvs[3].y/2;
+			} else {
+				sumx = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
+				sumy = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
+			}
+			dx = (sumx >> 3) + roundtab_76[sumx & 0xf];
+			dy = (sumy >> 3) + roundtab_76[sumy & 0xf];
+			
 			Data->iMinSAD[1] += ChromaSAD(dx, dy, Data);
 		}
 	}
@@ -1867,8 +1896,8 @@
 			pMB->sad8[2] = pMB->sad8[3] =  Data->iMinSAD[0];
 
 		if(pParam->m_quarterpel) {
-			pMB->pmvs[0].x = Data->currentQMV[0].x - Data->predQMV.x;
-			pMB->pmvs[0].y = Data->currentQMV[0].y - Data->predQMV.y;
+			pMB->pmvs[0].x = Data->currentQMV[0].x - Data->predMV.x;
+			pMB->pmvs[0].y = Data->currentQMV[0].y - Data->predMV.y;
 		} else {
 			pMB->pmvs[0].x = Data->currentMV[0].x - Data->predMV.x;
 			pMB->pmvs[0].y = Data->currentMV[0].y - Data->predMV.y;
@@ -2008,7 +2037,7 @@
 }
 
 #define INTRA_THRESH	1350
-#define INTER_THRESH	900
+#define INTER_THRESH	1200
 
 
 int
@@ -2034,14 +2063,14 @@
 	Data.iFcode = Current->fcode;
 	CheckCandidate = CheckCandidate16no4vI;
 
-	if (intraCount < 12) // we're right after an I frame
-		IntraThresh += 4 * (intraCount - 12) * (intraCount - 12);
+	if (intraCount < 10) // we're right after an I frame
+		IntraThresh += 4 * (intraCount - 10) * (intraCount - 10);
 	else
 		if ( 5*(maxIntra - intraCount) < maxIntra) // we're close to maximum. 2 sec when max is 10 sec
 			IntraThresh -= (IntraThresh * (maxIntra - 5*(maxIntra - intraCount)))/maxIntra;
 
 
-	InterThresh += 300 * (1 - bCount);
+	InterThresh += 400 * (1 - bCount);
 	if (InterThresh < 200) InterThresh = 200;
 
 	if (sadInit) (*sadInit) ();