Index: display/simgraph16.cc
===================================================================
--- display/simgraph16.cc	(revision 8112)
+++ display/simgraph16.cc	(working copy)
@@ -7,6 +7,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
+#include <algorithm>
 
 #include "../macros.h"
 #include "../simtypes.h"
@@ -32,24 +33,6 @@
 #	include <unistd.h>
 #endif
 
-// first: find out, which copy routines we may use!
-#ifndef  __GNUC__
-# undef USE_C
-# define USE_C
-# ifndef  _M_IX86
-#  define ALIGN_COPY
-# endif
-#else
-# if defined(USE_C)  ||  !defined(__i386__)
-#  undef USE_C
-#  define USE_C
-#  if GCC_ATLEAST(4, 2) || !defined(__i386__)
-#   define ALIGN_COPY
-#   warning "Needs to use slower copy with GCC > 4.2.x"
-#  endif
-# endif
-#endif
-
 #ifdef MULTI_THREAD
 #include "../utils/simthread.h"
 
@@ -2357,12 +2340,7 @@
 		PIXVAL *tp = textur + xp + yp * disp_width;
 
 		do { // line decoder
-#ifdef USE_C
 			uint16 runlen = *sp++;
-#else
-			// assembler needs this size
-			uint32 runlen = *sp++;
-#endif
 			PIXVAL *p = tp;
 
 			// one line decoder
@@ -2379,73 +2357,40 @@
 					sp += runlen;
 				}
 				else {
-#ifdef USE_C
-#ifndef ALIGN_COPY
-				{
-					// "classic" C code (why is it faster!?!)
-					const uint32 *ls;
-					uint32 *ld;
+#if defined LOW_LEVEL
+					// low level c++
 
-					if (runlen & 1) {
-						*p++ = *sp++;
+					if(  reinterpret_cast<size_t>(p) & 0x1  ) {
+						return;
 					}
 
-					ls = (const uint32 *)sp;
-					ld = (uint32 *)p;
-					runlen >>= 1;
-					while (runlen--) {
-						*ld++ = *ls++;
+					if(  runlen  ) {
+						// align destination to 4 bytes (uintptr_t would be the proper type, but it is not available here)
+						if(  reinterpret_cast<size_t>(p) & 0x2  ) {
+							*p++ = *sp++;
+							runlen--;
+						}
+						// fast 32 bit copy loop; NOTE(review): only p was aligned above, sp may still be just 2-byte aligned
+						bool const postalign = runlen & 1;
+						runlen >>= 1;
+						uint32 *ld = (uint32 *)p;
+						const uint32 *ls = (const uint32 *)sp;
+						while (runlen--) {
+							*ld++ = *ls++;
+						}
+						p = (PIXVAL*)ld;
+						sp = (const PIXVAL*)ls;
+						// copy the single trailing pixel left over after the 32 bit loop
+						if(  postalign  ) {
+							*p++ = *sp++;
+						}
 					}
-					p = (PIXVAL*)ld;
-					sp = (const PIXVAL*)ls;
-				}
 #else
-				// some architectures: faster with inline of memory functions!
-				memcpy( p, sp, runlen*sizeof(PIXVAL) );
-				sp += runlen;
-				p += runlen;
+					// high level c++
+					const PIXVAL *const splast = sp + runlen;
+					p = std::copy(sp, splast, p);
+					sp = splast;
 #endif
-#else
-				// this code is sometimes slower, mostly 5% faster, not really clear why and when (cache alignment?)
-				asm volatile (
-					// rep movsw and we would be finished, but we unroll
-					// uneven number of words to copy
-					"shrl %2\n\t"
-					"jnc 0f\n\t"
-					// Copy first word
-					// *p++ = *sp++;
-					"movsw\n\t"
-					"0:\n\t"
-					"negl %2\n\t"
-					"addl $1f, %2\n\t"
-					"jmp * %2\n\t"
-					"ud2\n\t"
-					".p2align 4\n\t"
-#define MOVSD1   "movsd\n\t"
-#define MOVSD2   MOVSD1   MOVSD1
-#define MOVSD4   MOVSD2   MOVSD2
-#define MOVSD8   MOVSD4   MOVSD4
-#define MOVSD16  MOVSD8   MOVSD8
-#define MOVSD32  MOVSD16  MOVSD16
-#define MOVSD64  MOVSD32  MOVSD32
-#define MOVSD128 MOVSD64  MOVSD64
-#define MOVSD256 MOVSD128 MOVSD128
-					MOVSD256
-#undef MOVSD256
-#undef MOVSD128
-#undef MOVSD64
-#undef MOVSD32
-#undef MOVSD16
-#undef MOVSD8
-#undef MOVSD4
-#undef MOVSD2
-#undef MOVSD1
-					"1:\n\t"
-					: "+D" (p), "+S" (sp), "+r" (runlen)
-					:
-					: "cc", "memory"
-				);
-#endif
 				}
 				runlen = *sp++;
 			} while (runlen != 0);
@@ -3820,20 +3765,11 @@
 // scrolls horizontally, will ignore clipping etc.
 void display_scroll_band(const KOORD_VAL start_y, const KOORD_VAL x_offset, const KOORD_VAL h)
 {
-	const PIXVAL* src = textur + start_y * disp_width + x_offset;
-	PIXVAL* dst = textur + start_y * disp_width;
-	size_t amount = sizeof(PIXVAL) * (h * disp_width - x_offset);
+	const PIXVAL*const src = textur + start_y * disp_width + x_offset;
+	PIXVAL *const dst = textur + start_y * disp_width;
+	const size_t amount = sizeof(PIXVAL) * (h * disp_width - x_offset);
 
-#ifdef USE_C
 	memmove(dst, src, amount);
-#else
-	amount /= 4;
-	asm volatile (
-		"rep\n\t"
-		"movsl\n\t"
-		: "+D" (dst), "+S" (src), "+c" (amount)
-	);
-#endif
 }
 
 
@@ -3870,9 +3806,6 @@
 	if (clip_lr(&xp, &w, cL, cR) && clip_lr(&yp, &h, cT, cB)) {
 		PIXVAL *p = textur + xp + yp * disp_width;
 		int dx = disp_width - w;
-#if !defined( USE_C )  ||  !defined( ALIGN_COPY )
-		const uint32 longcolval = (colval << 16) | colval;
-#endif
 
 		if (dirty) {
 			mark_rect_dirty_nc(xp, yp, xp + w - 1, yp + h - 1);
@@ -3879,42 +3812,33 @@
 		}
 
 		do {
-#ifdef USE_C
+#if defined LOW_LEVEL
+			// low level c++
+			const uint32 colvald = (colval << 16) | colval;
 			KOORD_VAL count = w;
-#ifdef ALIGN_COPY
-			// unfortunately the GCC > 4.1.x has a bug in the optimizer
-			while(  count-- != 0  ) {
-				*p++ = colval;
-			}
-#else
-			uint32 *lp;
 
-			if(  count & 1  ) {
-				*p++ = colval;
+			// align destination to 4 bytes (uintptr_t would be the proper type, but it is not available here)
+			if(  reinterpret_cast<size_t>(p) & 0x2  ) {
+				*p++ = (PIXVAL)colvald;
+				count--;
 			}
+			// aligned fast fill loop
+			bool const postalign = count & 1;
 			count >>= 1;
-			lp = (uint32 *)p;
-			while(  count-- != 0  ) {
-				*lp++ = longcolval;
+			uint32 *lp = (uint32 *)p;
+			while(count--) {
+				*lp++ = colvald;
 			}
 			p = (PIXVAL *)lp;
-#endif
+			// fill the single trailing pixel left over after the 32 bit loop
+			if(  postalign  ) {
+				*p++ = (PIXVAL)colvald;
+			}
 #else
-			unsigned int count = w;
-			asm volatile (
-				// uneven words to copy?
-				"shrl %1\n\t"
-				"jnc 0f\n\t"
-				// set first word
-				"stosw\n\t"
-				"0:\n\t"
-				// now we set long words ...
-				"rep\n\t"
-				"stosl"
-				: "+D" (p), "+c" (count)
-				: "a" (longcolval)
-				: "cc", "memory"
-			);
+			// high level c++
+			PIXVAL *const fillend = p + w;
+			std::fill(p, fillend, colval);
+			p = fillend;
 #endif
 			p += dx;
 		} while (--h != 0);
@@ -4273,10 +4197,6 @@
 	KOORD_VAL x0;	// store the initial x (for dirty marking)
 	KOORD_VAL y_offset, char_height;	// real y for display with clipping
 	unsigned char mask1, mask2;	// for horizontal clipping
-#ifndef USE_C
-	// faster drawing with assembler
-	const uint32 color2 = (color << 16) | color;
-#endif
 
 	// TAKE CARE: Clipping area may be larger than actual screen size ...
 	if(  (flags & DT_CLIP)  ) {
@@ -4371,7 +4291,8 @@
 				for (h = char_yoffset; h < char_height; h++) {
 					unsigned int dat = *p++ & m;
 					PIXVAL* dst = textur + screen_pos;
-#ifdef USE_C
+#if defined LOW_LEVEL
+					// low level c++
 					if (dat != 0) {
 						if (dat & 0x80) dst[0] = color;
 						if (dat & 0x40) dst[1] = color;
@@ -4383,9 +4304,14 @@
 						if (dat & 0x01) dst[7] = color;
 					}
 #else
-					// assemble variant of the above, using table and string instructions:
-					// optimized for long pipelines ...
-#					include "text_pixel.c"
+					// high level c++
+					if(  dat  !=  0  ) {
+						for(  size_t dat_offset = 0 ; dat_offset < 8 ; dat_offset++  ) {
+							if(  (dat & (0x80 >> dat_offset))  ) {
+								dst[dat_offset] = color;
+							}
+						}
+					}
 #endif
 					screen_pos += disp_width;
 				}
Index: Makefile
===================================================================
--- Makefile	(revision 8112)
+++ Makefile	(working copy)
@@ -19,7 +19,7 @@
 
 ifeq ($(OSTYPE),amiga)
   STD_LIBS ?= -lunix -lSDL_mixer -lsmpeg -lvorbisfile -lvorbis -logg
-  CFLAGS += -mcrt=newlib -DUSE_C -DSIM_BIG_ENDIAN -gstabs+
+  CFLAGS += -mcrt=newlib -DSIM_BIG_ENDIAN -gstabs+
   LDFLAGS += -Bstatic -non_shared
 else
 # BeOS (obsolete)
Index: simconst.h
===================================================================
--- simconst.h	(revision 8112)
+++ simconst.h	(working copy)
@@ -32,9 +32,9 @@
 /* need to emulate the mouse pointer with a graphic */
 //#define USE_SOFTPOINTER
 
-/* Use C implementation of image drawing routines
- * needed i.e. for MSVC and PowerPC */
-//#define USE_C
+/* Use the low-level C/C++ implementations of routines.
+ * Some routines, e.g. for drawing, have low-level C++ variants that might perform better on certain platforms. */
+//#define LOW_LEVEL
 
 // The wind (i.e. approach direction) is random all over the map (not recommended, since it confuses players)
 //#define USE_DIFFERENT_WIND
@@ -92,10 +92,4 @@
 #define Z_PLAN (4)
 #define Z_GRID (0)
 
-
-// sanity check: USE_C if not GCC and not intel 32bit
-#if !defined USE_C && (!defined __GNUC__ || !defined __i386__)
-#	define USE_C
 #endif
-
-#endif