encoding.cpp

Character encoding detection and transcoding implementation. More…
Namespaces

Name
libvroom
Detailed Description

Character encoding detection and transcoding implementation.
Uses simdutf for SIMD-accelerated encoding conversion. Custom handling for Windows-1252 (0x80-0x9F range) which simdutf doesn’t support.
Source code


#include "libvroom/encoding.h"

#include <algorithm>
#include <cctype>
#include <cstring>
#include <simdutf.h>
#include <stdexcept>
#include <string>
#include <vector>

namespace libvroom {

const char* encoding_to_string(CharEncoding enc) {
  switch (enc) {
  case CharEncoding::UTF8:
    return "UTF-8";
  case CharEncoding::UTF8_BOM:
    return "UTF-8 (BOM)";
  case CharEncoding::UTF16_LE:
    return "UTF-16LE";
  case CharEncoding::UTF16_BE:
    return "UTF-16BE";
  case CharEncoding::UTF32_LE:
    return "UTF-32LE";
  case CharEncoding::UTF32_BE:
    return "UTF-32BE";
  case CharEncoding::LATIN1:
    return "Latin-1";
  case CharEncoding::WINDOWS_1252:
    return "Windows-1252";
  case CharEncoding::UNKNOWN:
    return "Unknown";
  }
  return "Unknown";
}

CharEncoding parse_encoding_name(std::string_view name) {
  // Normalize: lowercase, remove hyphens and underscores
  std::string normalized;
  normalized.reserve(name.size());
  for (char c : name) {
    if (c != '-' && c != '_') {
      normalized += static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    }
  }

  if (normalized == "utf8")
    return CharEncoding::UTF8;
  if (normalized == "utf16le")
    return CharEncoding::UTF16_LE;
  if (normalized == "utf16be")
    return CharEncoding::UTF16_BE;
  if (normalized == "utf32le")
    return CharEncoding::UTF32_LE;
  if (normalized == "utf32be")
    return CharEncoding::UTF32_BE;
  if (normalized == "latin1" || normalized == "iso88591")
    return CharEncoding::LATIN1;
  if (normalized == "windows1252" || normalized == "cp1252" || normalized == "win1252")
    return CharEncoding::WINDOWS_1252;

  return CharEncoding::UNKNOWN;
}

// Windows-1252 lookup table for bytes 0x80-0x9F.
// These map to Unicode code points that differ from Latin-1.
// Entries of 0 indicate undefined bytes (mapped to U+FFFD replacement char).
static const uint32_t windows1252_to_unicode[32] = {
    0x20AC, // 0x80 -> Euro sign
    0,      // 0x81 -> undefined
    0x201A, // 0x82 -> Single low-9 quotation mark
    0x0192, // 0x83 -> Latin small letter f with hook
    0x201E, // 0x84 -> Double low-9 quotation mark
    0x2026, // 0x85 -> Horizontal ellipsis
    0x2020, // 0x86 -> Dagger
    0x2021, // 0x87 -> Double dagger
    0x02C6, // 0x88 -> Modifier letter circumflex accent
    0x2030, // 0x89 -> Per mille sign
    0x0160, // 0x8A -> Latin capital letter S with caron
    0x2039, // 0x8B -> Single left-pointing angle quotation mark
    0x0152, // 0x8C -> Latin capital ligature OE
    0,      // 0x8D -> undefined
    0x017D, // 0x8E -> Latin capital letter Z with caron
    0,      // 0x8F -> undefined
    0,      // 0x90 -> undefined
    0x2018, // 0x91 -> Left single quotation mark
    0x2019, // 0x92 -> Right single quotation mark
    0x201C, // 0x93 -> Left double quotation mark
    0x201D, // 0x94 -> Right double quotation mark
    0x2022, // 0x95 -> Bullet
    0x2013, // 0x96 -> En dash
    0x2014, // 0x97 -> Em dash
    0x02DC, // 0x98 -> Small tilde
    0x2122, // 0x99 -> Trade mark sign
    0x0161, // 0x9A -> Latin small letter s with caron
    0x203A, // 0x9B -> Single right-pointing angle quotation mark
    0x0153, // 0x9C -> Latin small ligature oe
    0,      // 0x9D -> undefined
    0x017E, // 0x9E -> Latin small letter z with caron
    0x0178, // 0x9F -> Latin capital letter Y with diaeresis
};

// Encode a Unicode code point as UTF-8 bytes. Returns number of bytes written.
static size_t encode_utf8(uint32_t cp, uint8_t* out) {
  if (cp < 0x80) {
    out[0] = static_cast<uint8_t>(cp);
    return 1;
  } else if (cp < 0x800) {
    out[0] = static_cast<uint8_t>(0xC0 | (cp >> 6));
    out[1] = static_cast<uint8_t>(0x80 | (cp & 0x3F));
    return 2;
  } else if (cp < 0x10000) {
    out[0] = static_cast<uint8_t>(0xE0 | (cp >> 12));
    out[1] = static_cast<uint8_t>(0x80 | ((cp >> 6) & 0x3F));
    out[2] = static_cast<uint8_t>(0x80 | (cp & 0x3F));
    return 3;
  } else {
    out[0] = static_cast<uint8_t>(0xF0 | (cp >> 18));
    out[1] = static_cast<uint8_t>(0x80 | ((cp >> 12) & 0x3F));
    out[2] = static_cast<uint8_t>(0x80 | ((cp >> 6) & 0x3F));
    out[3] = static_cast<uint8_t>(0x80 | (cp & 0x3F));
    return 4;
  }
}

// Check if the data contains any bytes in the Windows-1252 0x80-0x9F range.
static bool has_windows1252_bytes(const uint8_t* data, size_t size) {
  for (size_t i = 0; i < size; ++i) {
    if (data[i] >= 0x80 && data[i] <= 0x9F) {
      return true;
    }
  }
  return false;
}

EncodingResult detect_encoding(const uint8_t* data, size_t size) {
  EncodingResult result;

  if (size == 0) {
    result.encoding = CharEncoding::UTF8;
    result.confidence = 1.0;
    return result;
  }

  // Check BOM (byte order mark)
  // Check UTF-32 BOMs first (4 bytes) before UTF-16 (2 bytes)
  if (size >= 4) {
    if (data[0] == 0xFF && data[1] == 0xFE && data[2] == 0x00 && data[3] == 0x00) {
      result.encoding = CharEncoding::UTF32_LE;
      result.bom_length = 4;
      result.confidence = 1.0;
      result.needs_transcoding = true;
      return result;
    }
    if (data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFE && data[3] == 0xFF) {
      result.encoding = CharEncoding::UTF32_BE;
      result.bom_length = 4;
      result.confidence = 1.0;
      result.needs_transcoding = true;
      return result;
    }
  }

  if (size >= 3) {
    if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
      result.encoding = CharEncoding::UTF8_BOM;
      result.bom_length = 3;
      result.confidence = 1.0;
      result.needs_transcoding = false; // Just skip BOM bytes
      return result;
    }
  }

  if (size >= 2) {
    if (data[0] == 0xFF && data[1] == 0xFE) {
      result.encoding = CharEncoding::UTF16_LE;
      result.bom_length = 2;
      result.confidence = 1.0;
      result.needs_transcoding = true;
      return result;
    }
    if (data[0] == 0xFE && data[1] == 0xFF) {
      result.encoding = CharEncoding::UTF16_BE;
      result.bom_length = 2;
      result.confidence = 1.0;
      result.needs_transcoding = true;
      return result;
    }
  }

  // No BOM found — heuristic detection.
  // Check for null byte patterns first, since null bytes are valid UTF-8
  // but strongly indicate UTF-16/32 encoding in text data.
  if (size >= 4) {
    // UTF-32: every 4th byte pattern
    bool could_be_utf32_le = true;
    bool could_be_utf32_be = true;
    size_t check_bytes = std::min(size, size_t(256));
    // Ensure we check a multiple of 4 bytes
    check_bytes = (check_bytes / 4) * 4;

    if (check_bytes >= 4) {
      for (size_t i = 0; i < check_bytes; i += 4) {
        // UTF-32 LE: significant byte first, then nulls
        if (data[i + 2] != 0 || data[i + 3] != 0) {
          could_be_utf32_le = false;
        }
        // UTF-32 BE: nulls first, then significant byte
        if (data[i] != 0 || data[i + 1] != 0) {
          could_be_utf32_be = false;
        }
        if (!could_be_utf32_le && !could_be_utf32_be)
          break;
      }

      if (could_be_utf32_le) {
        result.encoding = CharEncoding::UTF32_LE;
        result.confidence = 0.8;
        result.needs_transcoding = true;
        return result;
      }
      if (could_be_utf32_be) {
        result.encoding = CharEncoding::UTF32_BE;
        result.confidence = 0.8;
        result.needs_transcoding = true;
        return result;
      }
    }
  }

  // Check for UTF-16 null byte patterns
  if (size >= 2) {
    size_t null_even = 0; // Nulls at even positions (UTF-16BE pattern)
    size_t null_odd = 0;  // Nulls at odd positions (UTF-16LE pattern)
    size_t check_bytes = std::min(size, size_t(256));
    // Ensure we check a multiple of 2 bytes
    check_bytes = (check_bytes / 2) * 2;

    for (size_t i = 0; i < check_bytes; i += 2) {
      if (data[i] == 0)
        null_even++;
      if (data[i + 1] == 0)
        null_odd++;
    }

    size_t pairs = check_bytes / 2;
    // If most odd bytes are null -> UTF-16LE (ASCII range)
    if (null_odd > pairs / 2 && null_even < pairs / 4) {
      result.encoding = CharEncoding::UTF16_LE;
      result.confidence = 0.7;
      result.needs_transcoding = true;
      return result;
    }
    // If most even bytes are null -> UTF-16BE
    if (null_even > pairs / 2 && null_odd < pairs / 4) {
      result.encoding = CharEncoding::UTF16_BE;
      result.confidence = 0.7;
      result.needs_transcoding = true;
      return result;
    }
  }

  // No null byte patterns found. Check if it's valid UTF-8.
  // Sample the first 8KB for detection — validating the entire file is O(n) and
  // dominated benchmark time for large ASCII/UTF-8 files. Any non-UTF-8 bytes
  // will almost certainly appear in the first few KB of a file.
  const size_t utf8_sample = std::min(size, size_t(8192));
  if (simdutf::validate_utf8(reinterpret_cast<const char*>(data), utf8_sample)) {
    result.encoding = CharEncoding::UTF8;
    // Lower confidence when sampling vs full validation
    result.confidence = (size <= 8192) ? 1.0 : 0.95;
    result.needs_transcoding = false;
    return result;
  }

  // Not valid UTF-8 and not UTF-16/32 — likely a single-byte encoding.
  // Check for Windows-1252 bytes (0x80-0x9F range differs from Latin-1).
  if (has_windows1252_bytes(data, std::min(size, size_t(4096)))) {
    result.encoding = CharEncoding::WINDOWS_1252;
    result.confidence = 0.6;
    result.needs_transcoding = true;
    return result;
  }

  // Default to Latin-1 for any non-UTF-8 single-byte data
  result.encoding = CharEncoding::LATIN1;
  result.confidence = 0.5;
  result.needs_transcoding = true;
  return result;
}

// Transcode Windows-1252 to UTF-8.
// Handles the 0x80-0x9F range via lookup table, delegates 0xA0-0xFF to Latin-1 path.
static AlignedBuffer transcode_windows1252_to_utf8(const uint8_t* data, size_t size) {
  // Worst case: each byte could become 3 UTF-8 bytes (for BMP code points)
  AlignedBuffer buf = AlignedBuffer::allocate(size * 3);
  uint8_t* out = buf.data();
  size_t out_len = 0;

  for (size_t i = 0; i < size; ++i) {
    uint8_t byte = data[i];
    if (byte < 0x80) {
      // ASCII: pass through
      out[out_len++] = byte;
    } else if (byte >= 0x80 && byte <= 0x9F) {
      // Windows-1252 specific range
      uint32_t cp = windows1252_to_unicode[byte - 0x80];
      if (cp == 0) {
        // Undefined: use U+FFFD replacement character
        cp = 0xFFFD;
      }
      out_len += encode_utf8(cp, out + out_len);
    } else {
      // 0xA0-0xFF: same as Latin-1 (Unicode code point == byte value)
      out_len += encode_utf8(static_cast<uint32_t>(byte), out + out_len);
    }
  }

  // AlignedBuffer has no resize(), so we allocate a right-sized copy.
  // Acceptable since Windows-1252 files are typically small.
  AlignedBuffer result = AlignedBuffer::allocate(out_len);
  std::memcpy(result.data(), buf.data(), out_len);
  return result;
}

AlignedBuffer transcode_to_utf8(const uint8_t* data, size_t size, CharEncoding enc,
                                size_t bom_length) {
  // Skip BOM
  const uint8_t* src = data + bom_length;
  size_t src_size = size - bom_length;

  switch (enc) {
  case CharEncoding::UTF8:
  case CharEncoding::UTF8_BOM: {
    // Just copy without BOM
    AlignedBuffer buf = AlignedBuffer::allocate(src_size);
    std::memcpy(buf.data(), src, src_size);
    return buf;
  }

  case CharEncoding::UTF16_LE: {
    // Copy into aligned char16_t buffer to avoid UB from misaligned reinterpret_cast.
    // Current callers (mmap, AlignedBuffer) are 64-byte aligned with even BOM sizes,
    // but this is a public API so we don't assume alignment.
    size_t src16_len = src_size / 2;
    std::vector<char16_t> aligned16(src16_len);
    std::memcpy(aligned16.data(), src, src16_len * sizeof(char16_t));

    size_t utf8_len = simdutf::utf8_length_from_utf16le(aligned16.data(), src16_len);
    AlignedBuffer buf = AlignedBuffer::allocate(utf8_len);

    size_t written = simdutf::convert_utf16le_to_utf8(aligned16.data(), src16_len,
                                                      reinterpret_cast<char*>(buf.data()));
    if (written == 0 && src16_len > 0) {
      throw std::runtime_error("Failed to transcode UTF-16LE to UTF-8");
    }

    if (written != utf8_len) {
      AlignedBuffer result = AlignedBuffer::allocate(written);
      std::memcpy(result.data(), buf.data(), written);
      return result;
    }
    return buf;
  }

  case CharEncoding::UTF16_BE: {
    size_t src16_len = src_size / 2;
    std::vector<char16_t> aligned16(src16_len);
    std::memcpy(aligned16.data(), src, src16_len * sizeof(char16_t));

    size_t utf8_len = simdutf::utf8_length_from_utf16be(aligned16.data(), src16_len);
    AlignedBuffer buf = AlignedBuffer::allocate(utf8_len);

    size_t written = simdutf::convert_utf16be_to_utf8(aligned16.data(), src16_len,
                                                      reinterpret_cast<char*>(buf.data()));
    if (written == 0 && src16_len > 0) {
      throw std::runtime_error("Failed to transcode UTF-16BE to UTF-8");
    }

    if (written != utf8_len) {
      AlignedBuffer result = AlignedBuffer::allocate(written);
      std::memcpy(result.data(), buf.data(), written);
      return result;
    }
    return buf;
  }

  case CharEncoding::UTF32_LE: {
    size_t src32_len = src_size / 4;
    std::vector<char32_t> aligned32(src32_len);
    std::memcpy(aligned32.data(), src, src32_len * sizeof(char32_t));

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    // Byte-swap for big-endian machines
    for (size_t i = 0; i < src32_len; ++i) {
      uint32_t val = static_cast<uint32_t>(aligned32[i]);
      aligned32[i] = static_cast<char32_t>(__builtin_bswap32(val));
    }
#endif

    size_t utf8_len = simdutf::utf8_length_from_utf32(aligned32.data(), src32_len);
    AlignedBuffer buf = AlignedBuffer::allocate(utf8_len);

    size_t written = simdutf::convert_utf32_to_utf8(aligned32.data(), src32_len,
                                                    reinterpret_cast<char*>(buf.data()));
    if (written == 0 && src32_len > 0) {
      throw std::runtime_error("Failed to transcode UTF-32LE to UTF-8");
    }

    if (written != utf8_len) {
      AlignedBuffer result = AlignedBuffer::allocate(written);
      std::memcpy(result.data(), buf.data(), written);
      return result;
    }
    return buf;
  }

  case CharEncoding::UTF32_BE: {
    size_t src32_len = src_size / 4;
    std::vector<char32_t> aligned32(src32_len);
    std::memcpy(aligned32.data(), src, src32_len * sizeof(char32_t));

#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
    // Byte-swap for little-endian machines (common case)
    for (size_t i = 0; i < src32_len; ++i) {
      uint32_t val = static_cast<uint32_t>(aligned32[i]);
      aligned32[i] = static_cast<char32_t>(__builtin_bswap32(val));
    }
#endif

    size_t utf8_len = simdutf::utf8_length_from_utf32(aligned32.data(), src32_len);
    AlignedBuffer buf = AlignedBuffer::allocate(utf8_len);

    size_t written = simdutf::convert_utf32_to_utf8(aligned32.data(), src32_len,
                                                    reinterpret_cast<char*>(buf.data()));
    if (written == 0 && src32_len > 0) {
      throw std::runtime_error("Failed to transcode UTF-32BE to UTF-8");
    }

    if (written != utf8_len) {
      AlignedBuffer result = AlignedBuffer::allocate(written);
      std::memcpy(result.data(), buf.data(), written);
      return result;
    }
    return buf;
  }

  case CharEncoding::LATIN1: {
    size_t utf8_len =
        simdutf::utf8_length_from_latin1(reinterpret_cast<const char*>(src), src_size);
    AlignedBuffer buf = AlignedBuffer::allocate(utf8_len);

    size_t written = simdutf::convert_latin1_to_utf8(reinterpret_cast<const char*>(src), src_size,
                                                     reinterpret_cast<char*>(buf.data()));
    if (written == 0 && src_size > 0) {
      throw std::runtime_error("Failed to transcode Latin-1 to UTF-8");
    }

    if (written != utf8_len) {
      AlignedBuffer result = AlignedBuffer::allocate(written);
      std::memcpy(result.data(), buf.data(), written);
      return result;
    }
    return buf;
  }

  case CharEncoding::WINDOWS_1252: {
    return transcode_windows1252_to_utf8(src, src_size);
  }

  case CharEncoding::UNKNOWN:
    throw std::runtime_error("Cannot transcode from unknown encoding");
  }

  throw std::runtime_error("Unsupported encoding for transcoding");
}

} // namespace libvroom
Updated on 2026-02-16 at 19:19:38 +0000