FIX: Inconsistent retrieval of CP1252 encoded data in VARCHAR columns - Windows vs. Linux #468#495
FIX: Inconsistent retrieval of CP1252 encoded data in VARCHAR columns - Windows vs. Linux #468#495subrata-ms wants to merge 13 commits intomainfrom
Conversation
📊 Code Coverage Report
Diff CoverageDiff: main...HEAD, staged and unstaged changes
Summary
mssql_python/pybind/ddbc_bindings.cppLines 2062-2070 2062 size_t chunkBytes = DAE_CHUNK_SIZE;
2063 while (offset < totalBytes) {
2064 size_t len = std::min(chunkBytes, totalBytes - offset);
2065
! 2066 rc = putData((SQLPOINTER)(dataPtr + offset), static_cast<SQLLEN>(len));
2067 if (!SQL_SUCCEEDED(rc)) {
2068 LOG("SQLExecute: SQLPutData failed for "
2069 "SQL_C_CHAR chunk - offset=%zu",
2070 offset, totalBytes, len, rc);Lines 3412-3425 3412 "length=%lu",
3413 i, (unsigned long)numCharsInData);
3414 } else {
3415 // Buffer too small, fallback to streaming
! 3416 LOG("SQLGetData: CHAR column %d (WCHAR path) data "
! 3417 "truncated, using streaming LOB",
! 3418 i);
! 3419 row.append(FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false,
! 3420 "utf-16le"));
! 3421 }
3422 } else if (dataLen == SQL_NULL_DATA) {
3423 LOG("SQLGetData: Column %d is NULL (CHAR via WCHAR)", i);
3424 row.append(py::none());
3425 } else if (dataLen == 0) {Lines 3428-3447 3428 // Driver cannot report total length up front; this is
3429 // NOT a NULL value. Fall back to streaming via
3430 // FetchLobColumnData (repeated SQLGetData chunks) so
3431 // we don't silently lose data.
! 3432 LOG("SQLGetData: SQL_NO_TOTAL for column %d (CHAR via WCHAR), "
! 3433 "streaming via FetchLobColumnData",
! 3434 i);
! 3435 row.append(
! 3436 FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, "utf-16le"));
! 3437 } else if (dataLen < 0) {
! 3438 LOG("SQLGetData: Unexpected negative data length "
! 3439 "for column %d - dataType=%d, dataLen=%ld",
! 3440 i, dataType, (long)dataLen);
! 3441 ThrowStdException("SQLGetData returned an unexpected negative "
! 3442 "data length");
! 3443 }
3444 } else {
3445 // Surface driver errors instead of silently returning NULL.
3446 // Returning py::none() here would be indistinguishable from
3447 // a genuine SQL NULL value to the Python caller and is aLines 3445-3458 3445 // Surface driver errors instead of silently returning NULL.
3446 // Returning py::none() here would be indistinguishable from
3447 // a genuine SQL NULL value to the Python caller and is a
3448 // data-integrity risk.
! 3449 LOG_ERROR("SQLGetData: Error retrieving data for column %d "
! 3450 "(CHAR via WCHAR) - SQLRETURN=%d",
! 3451 i, ret);
! 3452 ThrowStdException("SQLGetData failed for CHAR/VARCHAR column "
! 3453 "fetched as SQL_C_WCHAR");
! 3454 }
3455 } else {
3456 // Allocate columnSize * 4 + 1 on ALL platforms (no #if guard).
3457 //
3458 // Why this differs from SQLBindColums / FetchBatchData:Lines 3517-3529 3517 // Driver cannot report total length up front; this is
3518 // NOT a NULL value. Fall back to streaming via
3519 // FetchLobColumnData (repeated SQLGetData chunks) so
3520 // we don't silently lose data.
! 3521 LOG("SQLGetData: SQL_NO_TOTAL for column %d (SQL_CHAR), "
! 3522 "streaming via FetchLobColumnData",
3523 i);
! 3524 row.append(FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false,
! 3525 effectiveCharEnc));
3526 } else if (dataLen < 0) {
3527 LOG("SQLGetData: Unexpected negative data length "
3528 "for column %d - dataType=%d, dataLen=%ld",
3529 i, dataType, (long)dataLen);Lines 3534-3545 3534 // Surface driver errors instead of silently returning NULL.
3535 // Returning py::none() here would be indistinguishable from
3536 // a genuine SQL NULL value to the Python caller and is a
3537 // data-integrity risk.
! 3538 LOG_ERROR("SQLGetData: Error retrieving data for column %d "
! 3539 "(SQL_CHAR) - SQLRETURN=%d",
! 3540 i, ret);
! 3541 ThrowStdException("SQLGetData failed for SQL_CHAR/VARCHAR column");
3542 }
3543 }
3544 break;
3545 }Lines 3602-3614 3602 // Driver cannot report total length up front; this is
3603 // NOT a NULL value. Fall back to streaming via
3604 // FetchLobColumnData (repeated SQLGetData chunks) so
3605 // we don't silently lose data.
! 3606 LOG("SQLGetData: SQL_NO_TOTAL for column %d (NVARCHAR), "
! 3607 "streaming via FetchLobColumnData",
3608 i);
! 3609 row.append(
! 3610 FetchLobColumnData(hStmt, i, SQL_C_WCHAR, true, false, "utf-16le"));
3611 } else if (dataLen < 0) {
3612 LOG("SQLGetData: Unexpected negative data length "
3613 "for column %d (NVARCHAR) - dataLen=%ld",
3614 i, (long)dataLen);Lines 3619-3630 3619 // Surface driver errors instead of silently returning NULL.
3620 // Returning py::none() here would be indistinguishable from
3621 // a genuine SQL NULL value to the Python caller and is a
3622 // data-integrity risk.
! 3623 LOG_ERROR("SQLGetData: Error retrieving data for column %d "
! 3624 "(NVARCHAR) - SQLRETURN=%d",
! 3625 i, ret);
! 3626 ThrowStdException("SQLGetData failed for NVARCHAR column");
3627 }
3628 }
3629 break;
3630 }Lines 4995-5005 4995 arrowColumnProducer->ptrValueBuffer = arrowColumnProducer->bitVal.get();
4996 break;
4997 default:
4998 std::ostringstream errorString;
! 4999 errorString << "Unsupported data type for Arrow batch fetch for column - "
! 5000 << columnName.c_str() << ", Type - " << dataType << ", column ID - "
! 5001 << (i + 1);
5002 LOG(errorString.str().c_str());
5003 ThrowStdException(errorString.str());
5004 break;
5005 }Lines 5255-5264 5255 buffers.datetimeoffsetBuffers[idxCol].data(),
5256 sizeof(DateTimeOffset),
5257 buffers.indicators[idxCol].data());
5258 if (!SQL_SUCCEEDED(ret)) {
! 5259 LOG("Error fetching SS_TIMESTAMPOFFSET data for column %d",
! 5260 idxCol + 1);
5261 return ret;
5262 }
5263 break;
5264 }Lines 5306-5316 5306 nullCounts[idxCol] += 1;
5307 continue;
5308 } else if (indicator < 0) {
5309 // Negative value is unexpected, log column index, SQL type & raise exception
! 5310 LOG("Unexpected negative data length. Column ID - %d, SQL Type - %d, Data "
! 5311 "Length - %lld",
! 5312 idxCol + 1, dataType, (long long)indicator);
5313 ThrowStdException("Unexpected negative data length.");
5314 }
5315 auto dataLen = static_cast<uint64_t>(indicator);Lines 5609-5618 5609 arrowSchemaBatchCapsule =
5610 py::capsule(arrowSchemaBatch.get(), "arrow_schema", [](void* ptr) {
5611 auto arrowSchema = static_cast<ArrowSchema*>(ptr);
5612 if (arrowSchema->release) {
! 5613 arrowSchema->release(arrowSchema);
! 5614 }
5615 delete arrowSchema;
5616 });
5617 } catch (...) {
5618 arrowSchemaBatch->release(arrowSchemaBatch.get());mssql_python/pybind/ddbc_bindings.h📋 Files Needing Attention📉 Files with overall lowest coverage (click to expand)mssql_python.pybind.logger_bridge.cpp: 59.2%
mssql_python.pybind.ddbc_bindings.h: 66.5%
mssql_python.row.py: 70.5%
mssql_python.pybind.logger_bridge.hpp: 70.8%
mssql_python.pybind.ddbc_bindings.cpp: 74.3%
mssql_python.pybind.connection.connection.cpp: 76.2%
mssql_python.__init__.py: 77.3%
mssql_python.ddbc_bindings.py: 79.6%
mssql_python.pybind.connection.connection_pool.cpp: 79.6%
mssql_python.connection.py: 85.2%🔗 Quick Links
|
There was a problem hiding this comment.
Pull request overview
This PR changes the default decoding path for SQL CHAR/VARCHAR (ODBC SQL_CHAR) columns to fetch as wide characters (SQL_C_WCHAR) and decode as UTF-16LE, eliminating Windows-vs-Linux inconsistencies when server code pages contain bytes that are invalid UTF-8.
Changes:
- Update connection defaults so
SQL_CHARdecoding usesencoding="utf-16le"withctype=SQL_WCHAR. - Plumb
charCtypethrough cursor fetch APIs into the C++ bindings, enabling wide-char fetch forSQL_CHARcolumns. - Expand/adjust encoding tests to validate new defaults and reproduce the CP1252 byte behavior.
Reviewed changes
Copilot reviewed 5 out of 5 changed files in this pull request and generated 4 comments.
Show a summary per file
| File | Description |
|---|---|
mssql_python/connection.py |
Changes default decoding settings for SQL_CHAR to UTF-16LE/SQL_WCHAR. |
mssql_python/cursor.py |
Passes updated decoding settings (encoding + ctype) into the native fetch functions. |
mssql_python/pybind/ddbc_bindings.cpp |
Adds charCtype plumb-through and implements SQL_C_WCHAR paths for SQLGetData and bound-column fetching. |
mssql_python/pybind/ddbc_bindings.h |
Extends per-column metadata and adds a wide-char branch in ProcessChar. |
tests/test_013_encoding_decoding.py |
Updates default expectations and adds Windows-focused regression tests for the CP1252 byte case. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
sumitmsft
left a comment
There was a problem hiding this comment.
Left some minor comments. Otherwise, I am ok with this change
| py::arg("StatementHandle"), py::arg("row"), py::arg("charEncoding") = "utf-8", | ||
| py::arg("wcharEncoding") = "utf-16le"); | ||
| py::arg("wcharEncoding") = "utf-16le", py::arg("charCtype") = SQL_C_WCHAR); | ||
| m.def("DDBCSQLFetchMany", &FetchMany_wrap, py::arg("StatementHandle"), py::arg("rows"), | ||
| py::arg("fetchSize"), py::arg("charEncoding") = "utf-8", | ||
| py::arg("wcharEncoding") = "utf-16le", "Fetch many rows from the result set"); | ||
| py::arg("wcharEncoding") = "utf-16le", py::arg("charCtype") = SQL_C_WCHAR, | ||
| "Fetch many rows from the result set"); | ||
| m.def("DDBCSQLFetchAll", &FetchAll_wrap, "Fetch all rows from the result set", | ||
| py::arg("StatementHandle"), py::arg("rows"), py::arg("charEncoding") = "utf-8", |
There was a problem hiding this comment.
The pybind11 default for charEncoding is still "utf-8" here, but this PR changed the connection-level default to "utf-16le" everywhere else (Connection.init, setdecoding(), all three cursor fallbacks in cursor.py). I think these three defaults should be "utf-16le" to stay consistent with the rest of the PR., just veify this once?
Work Item / Issue Reference
Summary
This pull request updates the default handling of SQL
CHAR/VARCHARcolumns to use UTF-16 (wide character) encoding instead of UTF-8, primarily to address encoding mismatches on Windows and ensure consistent Unicode decoding. The changes span the connection, cursor, and C++ binding layers, and update related tests to reflect the new default behavior.Default Encoding and Decoding Changes:
The default decoding for SQL
CHARcolumns is now set to use"utf-16le"encoding and theSQL_WCHARctype, replacing the previous"utf-8"/SQL_CHARdefaults. This avoids issues where Windows ODBC drivers return raw bytes in the server's native code page, which may not decode as UTF-8. (mssql_python/connection.py, mssql_python/connection.pyR264-R271)All cursor fetch methods (
fetchone,fetchmany,fetchall) are updated to request UTF-16 decoding and pass the correct ctype when fetchingCHARdata, ensuring consistent behavior across platforms. (mssql_python/cursor.py, [1] [2] [3]C++ Binding and Processing Updates:
ColumnInfoExtstruct now tracks whether wide character (UTF-16) fetching is used for a column, and theProcessCharfunction is updated to handle both wide and narrow character paths, decoding appropriately based on the new setting. (mssql_python/pybind/ddbc_bindings.h, [1] [2] [3]Test Adjustments:
"utf-16le"andSQL_WCHARas the default decoding settings forSQL_CHARcolumns, and to validate the new default behavior. (tests/test_013_encoding_decoding.py, [1] [2] [3]