Index: display/simgraph16.cc =================================================================== --- display/simgraph16.cc (revision 8112) +++ display/simgraph16.cc (working copy) @@ -7,6 +7,7 @@ #include #include #include +#include #include "../macros.h" #include "../simtypes.h" @@ -32,24 +33,6 @@ # include #endif -// first: find out, which copy routines we may use! -#ifndef __GNUC__ -# undef USE_C -# define USE_C -# ifndef _M_IX86 -# define ALIGN_COPY -# endif -#else -# if defined(USE_C) || !defined(__i386__) -# undef USE_C -# define USE_C -# if GCC_ATLEAST(4, 2) || !defined(__i386__) -# define ALIGN_COPY -# warning "Needs to use slower copy with GCC > 4.2.x" -# endif -# endif -#endif - #ifdef MULTI_THREAD #include "../utils/simthread.h" @@ -2357,12 +2340,7 @@ PIXVAL *tp = textur + xp + yp * disp_width; do { // line decoder -#ifdef USE_C uint16 runlen = *sp++; -#else - // assembler needs this size - uint32 runlen = *sp++; -#endif PIXVAL *p = tp; // one line decoder @@ -2379,73 +2357,40 @@ sp += runlen; } else { -#ifdef USE_C -#ifndef ALIGN_COPY - { - // "classic" C code (why is it faster!?!) - const uint32 *ls; - uint32 *ld; +#if defined LOW_LEVEL + // low level c++ - if (runlen & 1) { - *p++ = *sp++; + if( reinterpret_cast(p) & 0x1 ) { + return; } - ls = (const uint32 *)sp; - ld = (uint32 *)p; - runlen >>= 1; - while (runlen--) { - *ld++ = *ls++; + if( runlen ) { + // align to 4 bytes, should use uintptr_t but not available + if( reinterpret_cast(p) & 0x2 ) { + *p++ = *sp++; + runlen--; + } + // aligned fast copy loop + bool const postalign = runlen & 1; + runlen >>= 1; + uint32 *ld = (uint32 *)p; + const uint32 *ls = (const uint32 *)sp; + while (runlen--) { + *ld++ = *ls++; + } + p = (PIXVAL*)ld; + sp = (const PIXVAL*)ls; + // finish unaligned remainder + if( postalign ) { + *p++ = *sp++; + } } - p = (PIXVAL*)ld; - sp = (const PIXVAL*)ls; - } #else - // some architectures: faster with inline of memory functions! - memcpy( p, sp, runlen*sizeof(PIXVAL) ); - sp += runlen; - p += runlen; + // high level c++ + const PIXVAL *const splast = sp + runlen; + p = std::copy(sp, splast, p); + sp = splast; #endif -#else - // this code is sometimes slower, mostly 5% faster, not really clear why and when (cache alignment?) - asm volatile ( - // rep movsw and we would be finished, but we unroll - // uneven number of words to copy - "shrl %2\n\t" - "jnc 0f\n\t" - // Copy first word - // *p++ = *sp++; - "movsw\n\t" - "0:\n\t" - "negl %2\n\t" - "addl $1f, %2\n\t" - "jmp * %2\n\t" - "ud2\n\t" - ".p2align 4\n\t" -#define MOVSD1 "movsd\n\t" -#define MOVSD2 MOVSD1 MOVSD1 -#define MOVSD4 MOVSD2 MOVSD2 -#define MOVSD8 MOVSD4 MOVSD4 -#define MOVSD16 MOVSD8 MOVSD8 -#define MOVSD32 MOVSD16 MOVSD16 -#define MOVSD64 MOVSD32 MOVSD32 -#define MOVSD128 MOVSD64 MOVSD64 -#define MOVSD256 MOVSD128 MOVSD128 - MOVSD256 -#undef MOVSD256 -#undef MOVSD128 -#undef MOVSD64 -#undef MOVSD32 -#undef MOVSD16 -#undef MOVSD8 -#undef MOVSD4 -#undef MOVSD2 -#undef MOVSD1 - "1:\n\t" - : "+D" (p), "+S" (sp), "+r" (runlen) - : - : "cc", "memory" - ); -#endif } runlen = *sp++; } while (runlen != 0); @@ -3820,20 +3765,11 @@ // scrolls horizontally, will ignore clipping etc. void display_scroll_band(const KOORD_VAL start_y, const KOORD_VAL x_offset, const KOORD_VAL h) { - const PIXVAL* src = textur + start_y * disp_width + x_offset; - PIXVAL* dst = textur + start_y * disp_width; - size_t amount = sizeof(PIXVAL) * (h * disp_width - x_offset); + const PIXVAL*const src = textur + start_y * disp_width + x_offset; + PIXVAL *const dst = textur + start_y * disp_width; + const size_t amount = sizeof(PIXVAL) * (h * disp_width - x_offset); -#ifdef USE_C memmove(dst, src, amount); -#else - amount /= 4; - asm volatile ( - "rep\n\t" - "movsl\n\t" - : "+D" (dst), "+S" (src), "+c" (amount) - ); -#endif } @@ -3870,9 +3806,6 @@ if (clip_lr(&xp, &w, cL, cR) && clip_lr(&yp, &h, cT, cB)) { PIXVAL *p = textur + xp + yp * disp_width; int dx = disp_width - w; -#if !defined( USE_C ) || !defined( ALIGN_COPY ) - const uint32 longcolval = (colval << 16) | colval; -#endif if (dirty) { mark_rect_dirty_nc(xp, yp, xp + w - 1, yp + h - 1); @@ -3879,42 +3812,33 @@ } do { -#ifdef USE_C +#if defined LOW_LEVEL + // low level c++ + const uint32 colvald = (colval << 16) | colval; KOORD_VAL count = w; -#ifdef ALIGN_COPY - // unfortunately the GCC > 4.1.x has a bug in the optimizer - while( count-- != 0 ) { - *p++ = colval; - } -#else - uint32 *lp; - if( count & 1 ) { - *p++ = colval; + // align to 4 bytes, should use uintptr_t but not available + if( reinterpret_cast(p) & 0x2 ) { + *p++ = (PIXVAL)colvald; + count--; } + // aligned fast fill loop + bool const postalign = count & 1; count >>= 1; - lp = (uint32 *)p; - while( count-- != 0 ) { - *lp++ = longcolval; + uint32 *lp = (uint32 *)p; + while(count--) { + *lp++ = colvald; } p = (PIXVAL *)lp; -#endif + // finish unaligned remainder + if( postalign ) { + *p++ = (PIXVAL)colvald; + } #else - unsigned int count = w; - asm volatile ( - // uneven words to copy? - "shrl %1\n\t" - "jnc 0f\n\t" - // set first word - "stosw\n\t" - "0:\n\t" - // now we set long words ... - "rep\n\t" - "stosl" - : "+D" (p), "+c" (count) - : "a" (longcolval) - : "cc", "memory" - ); + // high level c++ + PIXVAL *const fillend = p + w; + std::fill(p, fillend, colval); + p = fillend; #endif p += dx; } while (--h != 0); @@ -4273,10 +4197,6 @@ KOORD_VAL x0; // store the initial x (for dirty marking) KOORD_VAL y_offset, char_height; // real y for display with clipping unsigned char mask1, mask2; // for horizontal clipping -#ifndef USE_C - // faster drawing with assembler - const uint32 color2 = (color << 16) | color; -#endif // TAKE CARE: Clipping area may be larger than actual screen size ... if( (flags & DT_CLIP) ) { @@ -4371,7 +4291,8 @@ for (h = char_yoffset; h < char_height; h++) { unsigned int dat = *p++ & m; PIXVAL* dst = textur + screen_pos; -#ifdef USE_C +#if defined LOW_LEVEL + // low level c++ if (dat != 0) { if (dat & 0x80) dst[0] = color; if (dat & 0x40) dst[1] = color; @@ -4383,9 +4304,14 @@ if (dat & 0x01) dst[7] = color; } #else - // assemble variant of the above, using table and string instructions: - // optimized for long pipelines ... -# include "text_pixel.c" + // high level c++ + if( dat != 0 ) { + for( size_t dat_offset = 0 ; dat_offset < 8 ; dat_offset++ ) { + if( (dat & (0x80 >> dat_offset)) ) { + dst[dat_offset] = color; + } + } + } #endif screen_pos += disp_width; } Index: Makefile =================================================================== --- Makefile (revision 8112) +++ Makefile (working copy) @@ -19,7 +19,7 @@ ifeq ($(OSTYPE),amiga) STD_LIBS ?= -lunix -lSDL_mixer -lsmpeg -lvorbisfile -lvorbis -logg - CFLAGS += -mcrt=newlib -DUSE_C -DSIM_BIG_ENDIAN -gstabs+ + CFLAGS += -mcrt=newlib -DSIM_BIG_ENDIAN -gstabs+ LDFLAGS += -Bstatic -non_shared else # BeOS (obsolete) Index: simconst.h =================================================================== --- simconst.h (revision 8112) +++ simconst.h (working copy) @@ -32,9 +32,9 @@ /* need to emulate the mouse pointer with a graphic */ //#define USE_SOFTPOINTER -/* Use C implementation of image drawing routines - * needed i.e. for MSVC and PowerPC */ -//#define USE_C +/* Use low level C/C++ implementations of routines + * Some routines, eg for drawing, can have low level C++ implementations that might perform better on certain platforms */ +//#define LOW_LEVEL // The wind (i.e. approach direction) is random all over the map (not recommended, since it confuses players) //#define USE_DIFFERENT_WIND @@ -92,10 +92,4 @@ #define Z_PLAN (4) #define Z_GRID (0) - -// sanity check: USE_C if not GCC and not intel 32bit -#if !defined USE_C && (!defined __GNUC__ || !defined __i386__) -# define USE_C #endif - -#endif