dialect.cpp

dialect.cpp

Implementation of CSV dialect detection algorithm.

Namespaces

Name
libvroom

Source code


#include "libvroom/dialect.h"

#include "libvroom/io_util.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cmath>
#include <cstring>
#include <fstream>
#include <sstream>
#include <unordered_map>

namespace libvroom {

// ============================================================================
// Simple type validation (replaces SIMDTypeValidator for dialect detection)
// ============================================================================
// These are simpler scalar implementations used for dialect scoring.
// They don't need to be as fast as the main parsing path since they only
// run on a small sample of data during detection.

namespace {

inline bool could_be_integer(const uint8_t* data, size_t len) {
  if (len == 0)
    return false;

  size_t start = 0;

  // Skip leading whitespace
  while (start < len && (data[start] == ' ' || data[start] == '\t'))
    ++start;

  // Skip trailing whitespace
  size_t end = len;
  while (end > start && (data[end - 1] == ' ' || data[end - 1] == '\t'))
    --end;

  if (start >= end)
    return false;

  // Handle sign
  if (data[start] == '-' || data[start] == '+') {
    ++start;
    if (start >= end)
      return false;
  }

  // All remaining characters must be digits
  for (size_t i = start; i < end; ++i) {
    if (data[i] < '0' || data[i] > '9') {
      return false;
    }
  }

  return true;
}

inline bool could_be_float(const uint8_t* data, size_t len) {
  if (len == 0)
    return false;

  size_t start = 0;
  size_t end = len;

  // Skip whitespace
  while (start < end && (data[start] == ' ' || data[start] == '\t'))
    ++start;
  while (end > start && (data[end - 1] == ' ' || data[end - 1] == '\t'))
    --end;

  if (start >= end)
    return false;

  const uint8_t* ptr = data + start;
  size_t remaining = end - start;

  // Check for special values (nan, inf)
  if (remaining >= 3) {
    auto to_lower = [](uint8_t c) -> uint8_t { return (c >= 'A' && c <= 'Z') ? (c + 32) : c; };

    if (remaining == 3 && to_lower(ptr[0]) == 'n' && to_lower(ptr[1]) == 'a' &&
        to_lower(ptr[2]) == 'n') {
      return true;
    }

    size_t offset = 0;
    if (ptr[0] == '-' || ptr[0] == '+')
      offset = 1;

    if (remaining - offset >= 3 && to_lower(ptr[offset]) == 'i' &&
        to_lower(ptr[offset + 1]) == 'n' && to_lower(ptr[offset + 2]) == 'f') {
      if (remaining - offset == 3)
        return true;
      if (remaining - offset == 8 && to_lower(ptr[offset + 3]) == 'i' &&
          to_lower(ptr[offset + 4]) == 'n' && to_lower(ptr[offset + 5]) == 'i' &&
          to_lower(ptr[offset + 6]) == 't' && to_lower(ptr[offset + 7]) == 'y') {
        return true;
      }
    }
  }

  // Regular float: sign? digits* .? digits* (e sign? digits+)?
  size_t i = 0;
  bool has_digit = false;
  bool has_decimal = false;
  bool has_exponent = false;

  // Sign
  if (ptr[i] == '-' || ptr[i] == '+')
    ++i;

  // Integer part
  while (i < remaining && ptr[i] >= '0' && ptr[i] <= '9') {
    has_digit = true;
    ++i;
  }

  // Decimal point
  if (i < remaining && ptr[i] == '.') {
    has_decimal = true;
    ++i;

    // Fractional part
    while (i < remaining && ptr[i] >= '0' && ptr[i] <= '9') {
      has_digit = true;
      ++i;
    }
  }

  // Exponent
  if (i < remaining && (ptr[i] == 'e' || ptr[i] == 'E')) {
    has_exponent = true;
    ++i;

    if (i < remaining && (ptr[i] == '-' || ptr[i] == '+'))
      ++i;

    if (i >= remaining || ptr[i] < '0' || ptr[i] > '9') {
      return false; // Exponent requires digits
    }

    while (i < remaining && ptr[i] >= '0' && ptr[i] <= '9')
      ++i;
  }

  // Must have digits and be at end
  // Also, must have either decimal or exponent to be a float (not integer)
  return has_digit && (has_decimal || has_exponent) && i == remaining;
}

void validate_batch(const uint8_t** fields, const size_t* lengths, size_t count,
                    size_t& integer_count, size_t& float_count, size_t& other_count) {
  integer_count = 0;
  float_count = 0;
  other_count = 0;

  for (size_t i = 0; i < count; ++i) {
    if (could_be_integer(fields[i], lengths[i])) {
      integer_count++;
    } else if (could_be_float(fields[i], lengths[i])) {
      float_count++;
    } else {
      other_count++;
    }
  }
}

} // namespace

// ============================================================================
// Constants for dialect scoring
// ============================================================================

constexpr double ESCAPE_PATTERN_MATCH_BOOST = 1.2;

constexpr double DOUBLE_QUOTE_ESCAPE_BOOST = 1.1;

// ============================================================================
// Dialect
// ============================================================================

std::string Dialect::to_string() const {
  std::ostringstream ss;
  ss << "Dialect{delimiter=";

  // Format special characters nicely
  switch (delimiter) {
  case ',':
    ss << "','";
    break;
  case '\t':
    ss << "'\\t'";
    break;
  case ';':
    ss << "';'";
    break;
  case '|':
    ss << "'|'";
    break;
  case ':':
    ss << "':'";
    break;
  default:
    ss << "'" << delimiter << "'";
    break;
  }

  ss << ", quote=";
  if (quote_char == '"') {
    ss << "'\"'";
  } else if (quote_char == '\'') {
    ss << "\"'\"";
  } else if (quote_char == '\0') {
    ss << "none";
  } else {
    ss << "'" << quote_char << "'";
  }

  ss << ", escape=";
  if (double_quote) {
    ss << "double";
  } else if (escape_char == '\\') {
    ss << "backslash";
  } else {
    ss << "'" << escape_char << "'";
  }

  if (!comment_str.empty()) {
    ss << ", comment=\"" << comment_str << "\"";
  }

  ss << "}";
  return ss.str();
}

// ============================================================================
// DialectDetector
// ============================================================================

DialectDetector::DialectDetector(const DetectionOptions& options) : options_(options) {}

DetectionResult DialectDetector::detect(const uint8_t* buf, size_t len) const {
  DetectionResult result;

  if (buf == nullptr || len == 0) {
    result.warning = "Empty or null input";
    return result;
  }

  // Skip leading comment lines before dialect detection
  std::string comment_str;
  size_t comment_lines_skipped = 0;
  size_t comment_offset = skip_comment_lines(buf, len, comment_str, comment_lines_skipped);

  // Record detected comment info
  result.comment_str = comment_str;
  result.comment_lines_skipped = comment_lines_skipped;

  // Adjust buffer to skip comment lines
  const uint8_t* data_buf = buf + comment_offset;
  size_t data_len = len - comment_offset;

  if (data_len == 0) {
    result.warning = "File contains only comment lines";
    return result;
  }

  // Calculate adaptive sample size based on first row length.
  // For wide CSV files (many columns), rows can be very long.
  // We need at least min_rows complete rows for pattern detection.
  size_t effective_sample_size = options_.sample_size;

  // Find the first newline to estimate row length
  size_t first_newline = 0;
  for (size_t i = 0; i < std::min(data_len, options_.sample_size); ++i) {
    if (data_buf[i] == '\n') {
      first_newline = i + 1; // Include the newline
      break;
    }
  }

  // If we found a newline and the row is long, increase sample size
  // to ensure we can get at least min_rows complete rows
  if (first_newline > 0) {
    // Estimate bytes needed: first_row_len * (min_rows + 1) to be safe
    // The +1 accounts for potential variation in row lengths
    size_t estimated_needed = first_newline * (options_.min_rows + 1);
    if (estimated_needed > effective_sample_size) {
      // Cap at a reasonable maximum (1MB) to avoid excessive memory use
      constexpr size_t MAX_ADAPTIVE_SAMPLE = 1024 * 1024;
      effective_sample_size = std::min(estimated_needed, MAX_ADAPTIVE_SAMPLE);
    }
  } else if (data_len > options_.sample_size) {
    // No newline found in initial sample - this means rows are very long
    // Expand sample to try to capture at least one complete row
    // Use 4x the default sample size as a heuristic
    constexpr size_t MAX_ADAPTIVE_SAMPLE = 1024 * 1024;
    effective_sample_size = std::min(options_.sample_size * 4, MAX_ADAPTIVE_SAMPLE);
  }

  // Limit to actual data size and effective sample size
  size_t sample_len = std::min(data_len, effective_sample_size);

  // Detect line ending style
  result.dialect.line_ending = detect_line_ending(data_buf, sample_len);

  // Generate all candidate dialects
  auto candidates = generate_candidates();

  // Score each candidate
  for (const auto& dialect : candidates) {
    auto candidate = score_dialect(dialect, data_buf, sample_len);
    result.candidates.push_back(candidate);
  }

  // Sort by consistency score (best first)
  std::sort(result.candidates.begin(), result.candidates.end());

  // Select best candidate
  if (!result.candidates.empty() && result.candidates[0].consistency_score > 0) {
    const auto& best = result.candidates[0];
    result.dialect = best.dialect;
    result.dialect.line_ending = detect_line_ending(data_buf, sample_len);
    result.dialect.comment_str = comment_str; // Propagate detected comment string
    result.confidence = best.consistency_score;
    result.detected_columns = best.num_columns;

    // Detect header
    result.has_header = detect_header(result.dialect, data_buf, sample_len);

    // Count rows analyzed
    auto rows = find_rows(result.dialect, data_buf, sample_len);
    result.rows_analyzed = rows.size();

    // Check for ambiguous cases (multiple candidates with similar scores)
    if (result.candidates.size() > 1) {
      double second_score = result.candidates[1].consistency_score;
      if (second_score > 0.9 * best.consistency_score) {
        result.warning = "Multiple dialects have similar scores; detection may be ambiguous";
      }
    }
  } else {
    result.warning = "Could not detect a valid CSV dialect";
  }

  return result;
}

DetectionResult DialectDetector::detect_file(const std::string& filename) const {
  // Read sample from file
  std::ifstream file(filename, std::ios::binary);
  if (!file) {
    DetectionResult result;
    result.warning = "Could not open file: " + filename;
    return result;
  }

  std::vector<uint8_t> buffer(options_.sample_size);
  file.read(reinterpret_cast<char*>(buffer.data()), buffer.size());
  size_t bytes_read = static_cast<size_t>(file.gcount());

  return detect(buffer.data(), bytes_read);
}

std::vector<Dialect> DialectDetector::generate_candidates() const {
  std::vector<Dialect> candidates;

  // Generate all combinations of delimiter, quote char, and escape style
  for (char delim : options_.delimiters) {
    for (char quote : options_.quote_chars) {
      // Test double-quote escaping (RFC 4180 style: "" -> ")
      {
        Dialect d;
        d.delimiter = delim;
        d.quote_char = quote;
        d.escape_char = quote;
        d.double_quote = true;
        candidates.push_back(d);
      }

      // Test each escape character (e.g., backslash: \" -> ")
      for (char esc : options_.escape_chars) {
        if (esc != quote) { // Skip if same as quote (handled above)
          Dialect d;
          d.delimiter = delim;
          d.quote_char = quote;
          d.escape_char = esc;
          d.double_quote = false;
          candidates.push_back(d);
        }
      }
    }
    // Also test without quotes
    Dialect d;
    d.delimiter = delim;
    d.quote_char = '\0';
    d.escape_char = '\0';
    d.double_quote = false;
    candidates.push_back(d);
  }

  return candidates;
}

// Helper: detect escape pattern usage in data
// Returns: negative for double-quote pattern (RFC 4180), positive for escape-char preference
static int detect_escape_pattern(const uint8_t* buf, size_t len, char quote_char,
                                 char escape_char) {
  int escape_char_count = 0;
  int double_quote_count = 0;

  for (size_t i = 0; i + 1 < len; ++i) {
    // Check for escape_char followed by quote_char (e.g., \")
    // Only count if escape_char is different from quote_char
    if (escape_char != quote_char && buf[i] == static_cast<uint8_t>(escape_char) &&
        buf[i + 1] == static_cast<uint8_t>(quote_char)) {
      escape_char_count++;
    }
    // Check for double-quote pattern (e.g., "")
    if (buf[i] == static_cast<uint8_t>(quote_char) &&
        buf[i + 1] == static_cast<uint8_t>(quote_char)) {
      double_quote_count++;
    }
  }

  // Return negative for double-quote preference, positive for escape-char preference
  if (escape_char_count > 0 && double_quote_count == 0) {
    return escape_char_count; // Escape char pattern detected (e.g., \")
  } else if (double_quote_count > 0 && escape_char_count == 0) {
    return -double_quote_count; // Double-quote pattern detected (e.g., "")
  }
  return 0; // Ambiguous or no escapes
}

DialectCandidate DialectDetector::score_dialect(const Dialect& dialect, const uint8_t* buf,
                                                size_t len) const {

  DialectCandidate candidate;
  candidate.dialect = dialect;

  std::vector<size_t> row_field_counts;
  candidate.pattern_score = compute_pattern_score(dialect, buf, len, row_field_counts);

  if (row_field_counts.empty()) {
    return candidate; // No rows found
  }

  // Find modal column count
  std::unordered_map<size_t, size_t> count_freq;
  for (size_t c : row_field_counts) {
    count_freq[c]++;
  }
  size_t modal_count = 0;
  size_t modal_freq = 0;
  for (const auto& [count, freq] : count_freq) {
    if (freq > modal_freq) {
      modal_freq = freq;
      modal_count = count;
    }
  }
  candidate.num_columns = modal_count;

  // Compute type score
  candidate.type_score = compute_type_score(dialect, buf, len);

  // Combined consistency score
  // Use pattern_score as primary signal, with type_score as a bonus
  // This handles string-heavy files that would otherwise get penalized
  if (candidate.pattern_score > 0.9 && candidate.num_columns > 1) {
    // For highly consistent row patterns with multiple columns,
    // give a strong baseline score even if type_score is low.
    // Files with all strings are valid CSVs and should be detected.
    candidate.consistency_score =
        candidate.pattern_score * std::max(0.6, std::sqrt(std::max(0.1, candidate.type_score)));
  } else if (candidate.pattern_score > 0.8 && candidate.num_columns > 1) {
    candidate.consistency_score =
        candidate.pattern_score * std::sqrt(std::max(0.1, candidate.type_score));
  } else {
    candidate.consistency_score = candidate.pattern_score * candidate.type_score;
  }

  // Boost score based on escape pattern match
  // This helps distinguish dialects that produce similar field counts
  // but use different escape mechanisms.
  // Note: When both \" and "" patterns are present, returns 0 (ambiguous),
  // and no boost is applied - the tie-breakers will decide.
  if (dialect.quote_char != '\0') {
    char esc_to_check = dialect.double_quote ? '\0' : dialect.escape_char;
    if (esc_to_check != '\0') {
      int escape_signal = detect_escape_pattern(buf, len, dialect.quote_char, esc_to_check);
      if (escape_signal > 0 && !dialect.double_quote) {
        // Backslash escapes detected and this dialect uses backslash escaping
        candidate.consistency_score *= ESCAPE_PATTERN_MATCH_BOOST;
      } else if (escape_signal < 0 && dialect.double_quote) {
        // Double-quote escapes detected and this dialect uses double-quote
        candidate.consistency_score *= ESCAPE_PATTERN_MATCH_BOOST;
      }
    } else if (dialect.double_quote) {
      // Check if double-quote escapes are present
      int escape_signal = detect_escape_pattern(buf, len, dialect.quote_char, dialect.quote_char);
      if (escape_signal < 0) {
        // Double-quote escapes detected
        candidate.consistency_score *= DOUBLE_QUOTE_ESCAPE_BOOST;
      }
    }
  }

  return candidate;
}

double DialectDetector::compute_pattern_score(const Dialect& dialect, const uint8_t* buf,
                                              size_t len,
                                              std::vector<size_t>& row_field_counts) const {

  row_field_counts.clear();

  auto rows = find_rows(dialect, buf, len);
  if (rows.size() < options_.min_rows) {
    return 0.0;
  }

  // Count fields in each row
  for (const auto& [start, end] : rows) {
    assert(end >= start && "Invalid row range: end must be >= start");
    auto fields = extract_fields(dialect, buf + start, end - start);
    row_field_counts.push_back(fields.size());
  }

  if (row_field_counts.empty()) {
    return 0.0;
  }

  // Calculate consistency: fraction of rows matching modal field count
  std::unordered_map<size_t, size_t> count_freq;
  for (size_t c : row_field_counts) {
    count_freq[c]++;
  }

  size_t modal_freq = 0;
  for (const auto& [count, freq] : count_freq) {
    modal_freq = std::max(modal_freq, freq);
  }

  return static_cast<double>(modal_freq) / row_field_counts.size();
}

double DialectDetector::compute_type_score(const Dialect& dialect, const uint8_t* buf,
                                           size_t len) const {

  auto rows = find_rows(dialect, buf, len);
  if (rows.empty()) {
    return 0.0;
  }

  size_t typed_cells = 0;
  size_t total_cells = 0;

  // Skip first row if it might be a header
  size_t start_row = (rows.size() > 1) ? 1 : 0;

  // Collect all fields for batch processing.
  // Note: field_ptrs point into the input buffer `buf` (via extract_fields),
  // so they remain valid throughout this function's scope.
  std::vector<const uint8_t*> field_ptrs;
  std::vector<size_t> field_lengths;

  // Pre-allocate based on estimated fields
  size_t estimated_fields = (rows.size() - start_row) * 10;
  field_ptrs.reserve(estimated_fields);
  field_lengths.reserve(estimated_fields);

  for (size_t i = start_row; i < rows.size(); ++i) {
    const auto& [start, end] = rows[i];
    assert(end >= start && "Invalid row range: end must be >= start");
    auto fields = extract_fields(dialect, buf + start, end - start);

    for (const auto& field : fields) {
      field_ptrs.push_back(reinterpret_cast<const uint8_t*>(field.data()));
      field_lengths.push_back(field.size());
      total_cells++;
    }
  }

  if (total_cells == 0) {
    return 0.0;
  }

  // Use batch validation for integer/float detection.
  size_t integer_count = 0;
  size_t float_count = 0;
  size_t other_count = 0;

  validate_batch(field_ptrs.data(), field_lengths.data(), total_cells, integer_count, float_count,
                 other_count);

  // Integer and float cells are definitely typed
  typed_cells = integer_count + float_count;

  // For non-numeric fields, check if they're other typed values using
  // infer_cell_type(). We only call this for fields in the "other" category.
  if (other_count > 0) {
    for (size_t i = 0; i < total_cells; ++i) {
      const uint8_t* data = field_ptrs[i];
      size_t field_len = field_lengths[i];

      // Skip cells already counted as integer or float
      if (could_be_integer(data, field_len) || could_be_float(data, field_len)) {
        continue;
      }

      // Use infer_cell_type for non-numeric fields to detect:
      // empty, boolean, date, time, datetime
      std::string_view sv(reinterpret_cast<const char*>(data), field_len);
      CellType type = infer_cell_type(sv);
      if (type != CellType::STRING) {
        typed_cells++;
      }
    }
  }

  // Add small epsilon to avoid zero scores
  constexpr double eps = 1e-10;
  return std::max(eps, static_cast<double>(typed_cells) / total_cells);
}

Dialect::LineEnding DialectDetector::detect_line_ending(const uint8_t* buf, size_t len) {

  bool has_crlf = false;
  bool has_lf = false;
  bool has_cr = false;

  for (size_t i = 0; i < len; ++i) {
    if (buf[i] == '\r') {
      if (i + 1 < len && buf[i + 1] == '\n') {
        has_crlf = true;
        ++i; // Skip the \n
      } else {
        has_cr = true;
      }
    } else if (buf[i] == '\n') {
      has_lf = true;
    }
  }

  int count = (has_crlf ? 1 : 0) + (has_lf ? 1 : 0) + (has_cr ? 1 : 0);
  if (count > 1) {
    return Dialect::LineEnding::MIXED;
  }
  if (has_crlf)
    return Dialect::LineEnding::CRLF;
  if (has_lf)
    return Dialect::LineEnding::LF;
  if (has_cr)
    return Dialect::LineEnding::CR;
  return Dialect::LineEnding::UNKNOWN;
}

bool DialectDetector::detect_header(const Dialect& dialect, const uint8_t* buf, size_t len) const {

  auto rows = find_rows(dialect, buf, len);
  if (rows.size() < 2) {
    return false;
  }

  // Extract first two rows
  auto header_fields = extract_fields(dialect, buf + rows[0].first, rows[0].second - rows[0].first);
  auto data_fields = extract_fields(dialect, buf + rows[1].first, rows[1].second - rows[1].first);

  if (header_fields.empty() || data_fields.empty()) {
    return false;
  }

  // Heuristics for header detection:
  // 1. Header cells are typically all strings (non-empty)
  // 2. Data row has different types than header

  size_t header_strings = 0;
  size_t header_non_empty = 0;

  for (const auto& field : header_fields) {
    CellType type = infer_cell_type(field);
    if (type == CellType::STRING && !field.empty()) {
      header_strings++;
    }
    if (!field.empty()) {
      header_non_empty++;
    }
  }

  size_t data_non_strings = 0;
  for (const auto& field : data_fields) {
    CellType type = infer_cell_type(field);
    if (type != CellType::STRING && type != CellType::EMPTY) {
      data_non_strings++;
    }
  }

  // Header likely if:
  // - Most header cells are non-empty strings
  // - Data row has some typed (non-string) values, OR all header cells are strings
  double string_ratio =
      header_non_empty > 0 ? static_cast<double>(header_strings) / header_non_empty : 0.0;

  return (string_ratio > 0.5) && (data_non_strings > 0 || header_strings == header_fields.size());
}

bool DialectDetector::is_comment_line(const uint8_t* row_start, size_t row_len) const {
  if (options_.comment_chars.empty() || row_len == 0) {
    return false;
  }

  // Skip leading whitespace
  size_t i = 0;
  while (i < row_len && (row_start[i] == ' ' || row_start[i] == '\t')) {
    i++;
  }

  if (i >= row_len) {
    return false; // Empty line (all whitespace)
  }

  // Check if the remaining content starts with any comment string
  size_t remaining = row_len - i;
  const char* data = reinterpret_cast<const char*>(row_start + i);
  for (const auto& cs : options_.comment_chars) {
    if (!cs.empty() && remaining >= cs.size() && std::memcmp(data, cs.data(), cs.size()) == 0) {
      return true;
    }
  }

  return false;
}

std::vector<std::pair<size_t, size_t>>
DialectDetector::find_rows(const Dialect& dialect, const uint8_t* buf, size_t len) const {

  std::vector<std::pair<size_t, size_t>> rows;
  if (len == 0)
    return rows;

  bool in_quote = false;
  size_t row_start = 0;

  for (size_t i = 0; i < len; ++i) {
    uint8_t c = buf[i];

    // Handle escape character (backslash or other)
    // When we see an escape char, we skip both it and the next character.
    // Note: ++i here plus the for-loop's ++i after continue = skip 2 chars total
    if (!dialect.double_quote && dialect.escape_char != '\0' &&
        c == static_cast<uint8_t>(dialect.escape_char) && i + 1 < len) {
      ++i; // Move to escaped char; for-loop ++i will move past it
      continue;
    }

    if (dialect.quote_char != '\0' && c == static_cast<uint8_t>(dialect.quote_char)) {
      // Handle double-quote escaping (RFC 4180 style)
      if (dialect.double_quote && i + 1 < len &&
          buf[i + 1] == static_cast<uint8_t>(dialect.quote_char)) {
        ++i; // Skip escaped quote
      } else {
        in_quote = !in_quote;
      }
    } else if (!in_quote) {
      if (c == '\n') {
        // Handle CRLF
        size_t row_end = i;
        if (row_end > row_start && buf[row_end - 1] == '\r') {
          row_end--;
        }
        if (row_end > row_start) { // Non-empty row
          // Skip comment lines
          if (!is_comment_line(buf + row_start, row_end - row_start)) {
            rows.emplace_back(row_start, row_end);
          }
        }
        row_start = i + 1;

        if (rows.size() >= options_.max_rows) {
          break;
        }
      } else if (c == '\r' && (i + 1 >= len || buf[i + 1] != '\n')) {
        // CR not followed by LF (old Mac style)
        if (i > row_start) {
          // Skip comment lines
          if (!is_comment_line(buf + row_start, i - row_start)) {
            rows.emplace_back(row_start, i);
          }
        }
        row_start = i + 1;

        if (rows.size() >= options_.max_rows) {
          break;
        }
      }
    }
  }

  // Handle last row without trailing newline
  if (row_start < len && rows.size() < options_.max_rows) {
    // Skip comment lines
    if (!is_comment_line(buf + row_start, len - row_start)) {
      rows.emplace_back(row_start, len);
    }
  }

  return rows;
}

std::vector<std::string_view> DialectDetector::extract_fields(const Dialect& dialect,
                                                              const uint8_t* row_start,
                                                              size_t row_len) const {

  std::vector<std::string_view> fields;
  if (row_len == 0)
    return fields;

  const char* data = reinterpret_cast<const char*>(row_start);
  bool in_quote = false;
  size_t field_start = 0;

  for (size_t i = 0; i < row_len; ++i) {
    char c = data[i];

    // Handle escape character (backslash or other)
    // When we see an escape char, we skip both it and the next character.
    // Note: ++i here plus the for-loop's ++i after continue = skip 2 chars total
    if (!dialect.double_quote && dialect.escape_char != '\0' && c == dialect.escape_char &&
        i + 1 < row_len) {
      ++i; // Move to escaped char; for-loop ++i will move past it
      continue;
    }

    if (dialect.quote_char != '\0' && c == dialect.quote_char) {
      if (dialect.double_quote && i + 1 < row_len && data[i + 1] == dialect.quote_char) {
        ++i; // Skip escaped quote
      } else {
        in_quote = !in_quote;
      }
    } else if (!in_quote && c == dialect.delimiter) {
      // End of field
      fields.emplace_back(data + field_start, i - field_start);
      field_start = i + 1;
    }
  }

  // Add last field
  fields.emplace_back(data + field_start, row_len - field_start);

  // Remove quotes from quoted fields
  for (auto& field : fields) {
    if (field.size() >= 2 && dialect.quote_char != '\0' && field.front() == dialect.quote_char &&
        field.back() == dialect.quote_char) {
      field = field.substr(1, field.size() - 2);
    }
  }

  return fields;
}

DialectDetector::CellType DialectDetector::infer_cell_type(std::string_view cell) {
  // Trim whitespace
  while (!cell.empty() && std::isspace(static_cast<unsigned char>(cell.front()))) {
    cell.remove_prefix(1);
  }
  while (!cell.empty() && std::isspace(static_cast<unsigned char>(cell.back()))) {
    cell.remove_suffix(1);
  }

  if (cell.empty()) {
    return CellType::EMPTY;
  }

  // Boolean check
  if (cell == "true" || cell == "false" || cell == "TRUE" || cell == "FALSE" || cell == "True" ||
      cell == "False") {
    return CellType::BOOLEAN;
  }

  // Try parsing as integer
  {
    size_t i = 0;
    if (cell[0] == '+' || cell[0] == '-')
      i++;
    if (i < cell.size() && std::isdigit(static_cast<unsigned char>(cell[i]))) {
      bool all_digits = true;
      for (; i < cell.size() && all_digits; ++i) {
        if (!std::isdigit(static_cast<unsigned char>(cell[i]))) {
          all_digits = false;
        }
      }
      if (all_digits && i == cell.size()) {
        return CellType::INTEGER;
      }
    }
  }

  // Try parsing as float
  {
    size_t i = 0;
    if (cell[0] == '+' || cell[0] == '-')
      i++;

    bool has_digits = false;
    bool has_dot = false;
    bool has_exp = false;
    bool valid = true;

    // Integer part
    while (i < cell.size() && std::isdigit(static_cast<unsigned char>(cell[i]))) {
      has_digits = true;
      i++;
    }

    // Decimal part
    if (i < cell.size() && cell[i] == '.') {
      has_dot = true;
      i++;
      while (i < cell.size() && std::isdigit(static_cast<unsigned char>(cell[i]))) {
        has_digits = true;
        i++;
      }
    }

    // Exponent part
    if (i < cell.size() && (cell[i] == 'e' || cell[i] == 'E')) {
      has_exp = true;
      i++;
      if (i < cell.size() && (cell[i] == '+' || cell[i] == '-'))
        i++;
      bool exp_digits = false;
      while (i < cell.size() && std::isdigit(static_cast<unsigned char>(cell[i]))) {
        exp_digits = true;
        i++;
      }
      if (!exp_digits)
        valid = false;
    }

    if (valid && has_digits && (has_dot || has_exp) && i == cell.size()) {
      return CellType::FLOAT;
    }
  }

  // Date patterns: YYYY-MM-DD, YYYY/MM/DD, DD-MM-YYYY, DD/MM/YYYY
  if (cell.size() >= 8 && cell.size() <= 10) {
    bool might_be_date = false;

    // YYYY-MM-DD or YYYY/MM/DD
    if (cell.size() == 10 && std::isdigit(static_cast<unsigned char>(cell[0])) &&
        std::isdigit(static_cast<unsigned char>(cell[1])) &&
        std::isdigit(static_cast<unsigned char>(cell[2])) &&
        std::isdigit(static_cast<unsigned char>(cell[3])) && (cell[4] == '-' || cell[4] == '/') &&
        std::isdigit(static_cast<unsigned char>(cell[5])) &&
        std::isdigit(static_cast<unsigned char>(cell[6])) && cell[7] == cell[4] &&
        std::isdigit(static_cast<unsigned char>(cell[8])) &&
        std::isdigit(static_cast<unsigned char>(cell[9]))) {
      might_be_date = true;
    }

    // DD-MM-YYYY or DD/MM/YYYY
    if (cell.size() == 10 && std::isdigit(static_cast<unsigned char>(cell[0])) &&
        std::isdigit(static_cast<unsigned char>(cell[1])) && (cell[2] == '-' || cell[2] == '/') &&
        std::isdigit(static_cast<unsigned char>(cell[3])) &&
        std::isdigit(static_cast<unsigned char>(cell[4])) && cell[5] == cell[2] &&
        std::isdigit(static_cast<unsigned char>(cell[6])) &&
        std::isdigit(static_cast<unsigned char>(cell[7])) &&
        std::isdigit(static_cast<unsigned char>(cell[8])) &&
        std::isdigit(static_cast<unsigned char>(cell[9]))) {
      might_be_date = true;
    }

    if (might_be_date) {
      return CellType::DATE;
    }
  }

  // Time pattern: HH:MM or HH:MM:SS
  if ((cell.size() == 5 || cell.size() == 8) && std::isdigit(static_cast<unsigned char>(cell[0])) &&
      std::isdigit(static_cast<unsigned char>(cell[1])) && cell[2] == ':' &&
      std::isdigit(static_cast<unsigned char>(cell[3])) &&
      std::isdigit(static_cast<unsigned char>(cell[4]))) {
    if (cell.size() == 5) {
      return CellType::TIME;
    }
    if (cell[5] == ':' && std::isdigit(static_cast<unsigned char>(cell[6])) &&
        std::isdigit(static_cast<unsigned char>(cell[7]))) {
      return CellType::TIME;
    }
  }

  // Datetime: date + T/space + time
  if (cell.size() >= 16) {
    size_t sep_pos = cell.find('T');
    if (sep_pos == std::string_view::npos) {
      sep_pos = cell.find(' ');
    }
    if (sep_pos != std::string_view::npos && sep_pos >= 8) {
      auto date_part = cell.substr(0, sep_pos);
      auto time_part = cell.substr(sep_pos + 1);

      // Remove timezone suffix if present
      if (!time_part.empty() && time_part.back() == 'Z') {
        time_part.remove_suffix(1);
      }
      // Handle +HH:MM timezone
      auto plus_pos = time_part.find('+');
      auto minus_pos = time_part.find('-');
      if (plus_pos != std::string_view::npos && plus_pos > 0) {
        time_part = time_part.substr(0, plus_pos);
      } else if (minus_pos != std::string_view::npos && minus_pos > 5) {
        time_part = time_part.substr(0, minus_pos);
      }

      if (infer_cell_type(date_part) == CellType::DATE &&
          (infer_cell_type(time_part) == CellType::TIME || time_part.size() >= 5)) {
        return CellType::DATETIME;
      }
    }
  }

  return CellType::STRING;
}

const char* DialectDetector::cell_type_to_string(CellType type) {
  switch (type) {
  case CellType::EMPTY:
    return "EMPTY";
  case CellType::INTEGER:
    return "INTEGER";
  case CellType::FLOAT:
    return "FLOAT";
  case CellType::DATE:
    return "DATE";
  case CellType::DATETIME:
    return "DATETIME";
  case CellType::TIME:
    return "TIME";
  case CellType::BOOLEAN:
    return "BOOLEAN";
  case CellType::STRING:
    return "STRING";
  default:
    return "UNKNOWN";
  }
}

size_t DialectDetector::skip_comment_lines(const uint8_t* buf, size_t len, std::string& comment_str,
                                           size_t& lines_skipped) const {
  comment_str.clear();
  lines_skipped = 0;

  if (buf == nullptr || len == 0 || options_.comment_chars.empty()) {
    return 0;
  }

  size_t offset = 0;

  while (offset < len) {
    // Skip leading whitespace on the line (spaces and tabs only)
    size_t line_start = offset;
    while (offset < len && (buf[offset] == ' ' || buf[offset] == '\t')) {
      offset++;
    }

    if (offset >= len) {
      break;
    }

    // Check if this line starts with any comment string
    bool is_comment = false;
    size_t remaining = len - offset;
    const char* data = reinterpret_cast<const char*>(buf + offset);

    for (const auto& cs : options_.comment_chars) {
      if (!cs.empty() && remaining >= cs.size() && std::memcmp(data, cs.data(), cs.size()) == 0) {
        is_comment = true;
        // Record the comment string (first one found wins)
        if (comment_str.empty()) {
          comment_str = cs;
        }
        break;
      }
    }

    if (!is_comment) {
      // Not a comment line; return the start of this line (before whitespace)
      return line_start;
    }

    // This is a comment line; skip to end of line
    lines_skipped++;
    while (offset < len && buf[offset] != '\n' && buf[offset] != '\r') {
      offset++;
    }

    // Skip line ending (LF, CR, or CRLF)
    if (offset < len) {
      if (buf[offset] == '\r') {
        offset++;
        if (offset < len && buf[offset] == '\n') {
          offset++;
        }
      } else if (buf[offset] == '\n') {
        offset++;
      }
    }
  }

  // All lines were comments; return end of buffer
  return offset;
}

} // namespace libvroom

Updated on 2026-02-16 at 19:19:38 +0000