Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 79 additions & 17 deletions mssql_python/pybind/ddbc_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,19 @@
#define DAE_CHUNK_SIZE 8192
#define SQL_MAX_LOB_SIZE 8000

// Determine which encoding should actually be used when decoding SQL_C_CHAR
// column data into Python strings.
//
// On Linux/macOS the ODBC driver hands back UTF-8 for SQL_C_CHAR no matter
// what the server collation is (the driver performs the conversion itself,
// e.g. from CP1252), so the user-supplied encoding is ignored there. On
// Windows the driver returns the raw bytes in the server's native encoding,
// so the caller's choice is honored.
inline std::string GetEffectiveCharDecoding(const std::string& userEncoding) {
#if defined(__APPLE__) || defined(__linux__)
    static_cast<void>(userEncoding);  // intentionally unused on these platforms
    return std::string{"utf-8"};
#else
    return userEncoding;
#endif
}

//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Logging Infrastructure:
Expand Down Expand Up @@ -1153,7 +1166,8 @@ void SqlHandle::markImplicitlyFreed() {
// Log error but don't throw - we're likely in cleanup/destructor path
LOG_ERROR("SAFETY VIOLATION: Attempted to mark non-STMT handle as implicitly freed. "
"Handle type=%d. This will cause handle leak. Only STMT handles are "
"automatically freed by parent DBC handles.", _type);
"automatically freed by parent DBC handles.",
_type);
return; // Refuse to mark - let normal free() handle it
}
_implicitly_freed = true;
Expand Down Expand Up @@ -2875,17 +2889,18 @@ py::object FetchLobColumnData(SQLHSTMT hStmt, SQLUSMALLINT colIndex, SQLSMALLINT
return py::bytes(buffer.data(), buffer.size());
}

// For SQL_C_CHAR data, decode using the specified encoding
// For SQL_C_CHAR data, decode using the appropriate encoding.
const std::string effectiveCharEncoding = GetEffectiveCharDecoding(charEncoding);
py::bytes raw_bytes(buffer.data(), buffer.size());
try {
py::object decoded = raw_bytes.attr("decode")(charEncoding, "strict");
py::object decoded = raw_bytes.attr("decode")(effectiveCharEncoding, "strict");
LOG("FetchLobColumnData: Decoded narrow string with '%s' - %zu bytes -> %zu chars for "
"column %d",
charEncoding.c_str(), buffer.size(), py::len(decoded), colIndex);
effectiveCharEncoding.c_str(), buffer.size(), py::len(decoded), colIndex);
return decoded;
} catch (const py::error_already_set& e) {
LOG_ERROR("FetchLobColumnData: Failed to decode with '%s' for column %d: %s",
charEncoding.c_str(), colIndex, e.what());
effectiveCharEncoding.c_str(), colIndex, e.what());
// Return raw bytes as fallback
return raw_bytes;
}
Expand Down Expand Up @@ -2941,7 +2956,23 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
row.append(
FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding));
} else {
uint64_t fetchBufferSize = columnSize + 1 /* null-termination */;
// Allocate columnSize * 4 + 1 on ALL platforms (no #if guard).
//
// Why this differs from SQLBindColums / FetchBatchData:
// Those two functions use #if to apply *4 only on Linux/macOS,
// because on Windows with a non-UTF-8 collation (e.g. CP1252)
// each character occupies exactly 1 byte, so *1 suffices and
// saves memory across the entire batch (fetchSize × numCols
// buffers).
//
// SQLGetData_wrap allocates a single temporary buffer per
// column per row, so the over-allocation cost is negligible.
// Using *4 unconditionally here keeps the code simple and
// correct on every platform—including Windows with a UTF-8
// collation where multi-byte chars could otherwise cause
// truncation at the exact column boundary (e.g. CP1252 é in
// VARCHAR(10)).
uint64_t fetchBufferSize = columnSize * 4 + 1 /* null-termination */;
std::vector<SQLCHAR> dataBuffer(fetchBufferSize);
SQLLEN dataLen;
ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(),
Expand All @@ -2952,20 +2983,23 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
uint64_t numCharsInData = dataLen / sizeof(SQLCHAR);
if (numCharsInData < dataBuffer.size()) {
// SQLGetData will null-terminate the data
// Use Python's codec system to decode bytes with specified encoding
// Use Python's codec system to decode bytes.
const std::string decodeEncoding =
GetEffectiveCharDecoding(charEncoding);
py::bytes raw_bytes(reinterpret_cast<char*>(dataBuffer.data()),
static_cast<size_t>(dataLen));
try {
py::object decoded =
raw_bytes.attr("decode")(charEncoding, "strict");
raw_bytes.attr("decode")(decodeEncoding, "strict");
row.append(decoded);
LOG("SQLGetData: CHAR column %d decoded with '%s', %zu bytes "
"-> %zu chars",
i, charEncoding.c_str(), (size_t)dataLen, py::len(decoded));
i, decodeEncoding.c_str(), (size_t)dataLen,
py::len(decoded));
} catch (const py::error_already_set& e) {
LOG_ERROR(
"SQLGetData: Failed to decode CHAR column %d with '%s': %s",
i, charEncoding.c_str(), e.what());
i, decodeEncoding.c_str(), e.what());
// Return raw bytes as fallback
row.append(raw_bytes);
}
Expand Down Expand Up @@ -3451,7 +3485,14 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
// TODO: handle variable length data correctly. This logic wont
// suffice
HandleZeroColumnSizeAtFetch(columnSize);
// Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8
// expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where
// each character can be up to 4 bytes.
#if defined(__APPLE__) || defined(__linux__)
uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/;
#else
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
#endif
// TODO: For LONGVARCHAR/BINARY types, columnSize is returned as
// 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB.
// fetchSize=1 if columnSize>1GB. So we'll allocate a vector of
Expand Down Expand Up @@ -3598,7 +3639,8 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
// TODO: Move to anonymous namespace, since it is not used outside this file
SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& columnNames,
py::list& rows, SQLUSMALLINT numCols, SQLULEN& numRowsFetched,
const std::vector<SQLUSMALLINT>& lobColumns) {
const std::vector<SQLUSMALLINT>& lobColumns,
const std::string& charEncoding = "utf-8") {
LOG("FetchBatchData: Fetching data in batches");
SQLRETURN ret = SQLFetchScroll_ptr(hStmt, SQL_FETCH_NEXT, 0);
if (ret == SQL_NO_DATA) {
Expand Down Expand Up @@ -3628,8 +3670,22 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
std::find(lobColumns.begin(), lobColumns.end(), col + 1) != lobColumns.end();
columnInfos[col].processedColumnSize = columnInfos[col].columnSize;
HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize);
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where
// each character can be up to 4 bytes. Must match SQLBindColums buffer.
#if defined(__APPLE__) || defined(__linux__)
SQLSMALLINT dt = columnInfos[col].dataType;
bool isCharType = (dt == SQL_CHAR || dt == SQL_VARCHAR || dt == SQL_LONGVARCHAR);
if (isCharType) {
columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 +
1; // *4 for UTF-8, +1 for null terminator
} else {
columnInfos[col].fetchBufferSize =
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
}
#else
columnInfos[col].fetchBufferSize =
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
#endif
}

// Performance: Build function pointer dispatch table (once per batch)
Expand All @@ -3639,13 +3695,18 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
std::vector<ColumnProcessor> columnProcessors(numCols);
std::vector<ColumnInfoExt> columnInfosExt(numCols);

// Compute effective char encoding once for the batch (same for all columns)
const std::string effectiveCharEnc = GetEffectiveCharDecoding(charEncoding);

for (SQLUSMALLINT col = 0; col < numCols; col++) {
// Populate extended column info for processors that need it
columnInfosExt[col].dataType = columnInfos[col].dataType;
columnInfosExt[col].columnSize = columnInfos[col].columnSize;
columnInfosExt[col].processedColumnSize = columnInfos[col].processedColumnSize;
columnInfosExt[col].fetchBufferSize = columnInfos[col].fetchBufferSize;
columnInfosExt[col].isLob = columnInfos[col].isLob;
columnInfosExt[col].charEncoding = effectiveCharEnc;
columnInfosExt[col].isUtf8 = (effectiveCharEnc == "utf-8");

// Map data type to processor function (switch executed once per column,
// not per cell)
Expand Down Expand Up @@ -4085,7 +4146,8 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)(intptr_t)fetchSize, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);

ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns,
charEncoding);
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
LOG("FetchMany_wrap: Error when fetching data - SQLRETURN=%d", ret);
return ret;
Expand All @@ -4094,10 +4156,10 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
// Reset attributes before returning to avoid using stack pointers later
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)1, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, NULL, 0);

// Unbind columns to allow subsequent fetchone() calls to use SQLGetData
SQLFreeStmt_ptr(hStmt, SQL_UNBIND);

return ret;
}

Expand Down Expand Up @@ -4221,8 +4283,8 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);

while (ret != SQL_NO_DATA) {
ret =
FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns,
charEncoding);
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
LOG("FetchAll_wrap: Error when fetching data - SQLRETURN=%d", ret);
return ret;
Expand All @@ -4232,7 +4294,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,
// Reset attributes before returning to avoid using stack pointers later
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)1, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, NULL, 0);

// Unbind columns to allow subsequent fetchone() calls to use SQLGetData
SQLFreeStmt_ptr(hStmt, SQL_UNBIND);

Expand Down
47 changes: 38 additions & 9 deletions mssql_python/pybind/ddbc_bindings.h
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,8 @@ struct ColumnInfoExt {
SQLULEN processedColumnSize;
uint64_t fetchBufferSize;
bool isLob;
bool isUtf8; // Pre-computed from charEncoding (avoids string compare per cell)
std::string charEncoding; // Effective decoding encoding for SQL_C_CHAR data
};

// Forward declare FetchLobColumnData (defined in ddbc_bindings.cpp) - MUST be
Expand Down Expand Up @@ -811,21 +813,48 @@ inline void ProcessChar(PyObject* row, ColumnBuffers& buffers, const void* colIn
// fetchBufferSize includes null-terminator, numCharsInData doesn't. Hence
// '<'
if (!colInfo->isLob && numCharsInData < colInfo->fetchBufferSize) {
// Performance: Direct Python C API call - create string from buffer
PyObject* pyStr = PyUnicode_FromStringAndSize(
reinterpret_cast<char*>(
&buffers.charBuffers[col - 1][rowIdx * colInfo->fetchBufferSize]),
numCharsInData);
const char* dataPtr = reinterpret_cast<char*>(
&buffers.charBuffers[col - 1][rowIdx * colInfo->fetchBufferSize]);
PyObject* pyStr = nullptr;
#if defined(__APPLE__) || defined(__linux__)
// On Linux/macOS, ODBC driver returns UTF-8 — PyUnicode_FromStringAndSize
// expects UTF-8, so this is correct and fast.
pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData);
#else
// On Windows, ODBC driver returns bytes in the server's native encoding.
// For UTF-8, use the direct C API (PyUnicode_FromStringAndSize) which
// bypasses the codec registry for maximum reliability. For non-UTF-8
// encodings (e.g., CP1252), use PyUnicode_Decode with the codec registry.
if (colInfo->isUtf8) {
pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData);
} else {
pyStr =
PyUnicode_Decode(dataPtr, numCharsInData, colInfo->charEncoding.c_str(), "strict");
}
#endif
if (!pyStr) {
Py_INCREF(Py_None);
PyList_SET_ITEM(row, col - 1, Py_None);
// Decode failed — fall back to returning raw bytes (consistent with
// FetchLobColumnData and SQLGetData_wrap which also return raw bytes
// on decode failure instead of silently converting to None).
PyErr_Clear();
PyObject* pyBytes = PyBytes_FromStringAndSize(dataPtr, numCharsInData);
if (pyBytes) {
PyList_SET_ITEM(row, col - 1, pyBytes);
} else {
PyErr_Clear();
Py_INCREF(Py_None);
PyList_SET_ITEM(row, col - 1, Py_None);
}
} else {
PyList_SET_ITEM(row, col - 1, pyStr);
}
} else {
// Slow path: LOB data requires separate fetch call
PyList_SET_ITEM(row, col - 1,
FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false).release().ptr());
PyList_SET_ITEM(
row, col - 1,
FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false, colInfo->charEncoding)
.release()
.ptr());
}
}

Expand Down
1 change: 1 addition & 0 deletions tests/test_013_encoding_decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5697,6 +5697,7 @@ def test_default_encoding_behavior_validation(conn_str):
def test_encoding_with_bytes_and_bytearray_parameters(db_connection):
"""Test encoding with bytes and bytearray parameters (SQL_C_CHAR path)."""
db_connection.setencoding(encoding="utf-8", ctype=mssql_python.SQL_CHAR)
db_connection.setdecoding(mssql_python.SQL_CHAR, encoding="utf-8")

cursor = db_connection.cursor()
try:
Expand Down
Loading
Loading