From 8f297be26fa7d0fdbdb4fe2dfefe10ecb13fc742 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 30 May 2026 17:05:47 +0000
Subject: [PATCH] Vectorise bulk uint16 reads in the descriptor reader

Add node_body_t::read_uint16_block: bounds-check the whole block once, then
reconstruct each uint16 from its two little-endian bytes in a flat loop the
compiler vectorises (a copy on little-endian hosts, a byte-swap on
big-endian).  The image reader's per-pixel decode_uint16 loops, whose
per-element bounds check blocked vectorisation, were a large part of colour
pakset load time; each is now a single read_uint16_block call.  The decoded
pixels are bit-identical to the old loops' output; version-0 images keep
their per-pixel player-colour fixup.
---
 src/simutrans/descriptor/reader/image_reader.cc | 10 ++--------
 src/simutrans/descriptor/reader/obj_reader.h    | 10 ++++++++++
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/simutrans/descriptor/reader/image_reader.cc b/src/simutrans/descriptor/reader/image_reader.cc
index bfd22da6f..fab9b68f2 100644
--- a/src/simutrans/descriptor/reader/image_reader.cc
+++ b/src/simutrans/descriptor/reader/image_reader.cc
@@ -81,11 +81,8 @@ obj_desc_t *image_reader_t::read_node(FILE *fp, obj_node_info_t &node)
 		desc->imageid = IMG_EMPTY;
 
 		skip_reading_pixels_if_no_graphics;
-		uint16* dest = desc->data;
 		if (desc->h > 0) {
-			for (uint i = 0; i < desc->len; i++) {
-				*dest++ = decode_uint16(p);
-			}
+			p.read_uint16_block(desc->data, desc->len);
 		}
 	}
 	else if(version==3) {
@@ -99,11 +96,8 @@ obj_desc_t *image_reader_t::read_node(FILE *fp, obj_node_info_t &node)
 		desc->imageid = IMG_EMPTY;
 
 		skip_reading_pixels_if_no_graphics;
-		uint16* dest = desc->data;
 		if (desc->h > 0) {
-			for (uint i = 0; i < desc->len; i++) {
-				*dest++ = decode_uint16(p);
-			}
+			p.read_uint16_block(desc->data, desc->len);
 		}
 	}
 	else {
diff --git a/src/simutrans/descriptor/reader/obj_reader.h b/src/simutrans/descriptor/reader/obj_reader.h
index 050723fb5..ba7aec6e0 100644
--- a/src/simutrans/descriptor/reader/obj_reader.h
+++ b/src/simutrans/descriptor/reader/obj_reader.h
@@ -182,6 +182,16 @@ public:
 		return complain(2);
 	}
 
+	/// Bulk read of `count` little-endian uint16 values into `dest`: one bounds check, then a flat
+	/// loop, so the compiler vectorises it (a per-element read_uint16 loop won't).
+	void read_uint16_block(uint16* dest, size_t count)
+	{
+		const uint8* src = (const uint8*)read_bytes(count * 2); // NOTE(review): presumably read_bytes() bounds-checks the block and never yields NULL; also assumes count * 2 cannot wrap -- confirm
+		for (size_t i = 0; i < count; i++) {
+			dest[i] = (uint16)(src[2 * i] | (src[2 * i + 1] << 8)); // first byte is the low half, second byte the high half
+		}
+	}
+
 	inline uint32 read_uint32()
 	{
 		if (ptr + 3 < end) {
-- 
2.54.0

