Skip to content

Commit 2ed35ff

Browse files
committed
Expose API for running core's string detection on arbitrary buffers
This allows running string detection over an entire shared cache without having to load all of its files into a binary view.
1 parent cbab54c commit 2ed35ff

6 files changed

Lines changed: 373 additions & 1 deletion

File tree

binaryninjaapi.h

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5508,6 +5508,63 @@ namespace BinaryNinja {
55085508
static DerivedString FromAPIObject(BNDerivedString* str, bool owned);
55095509
};
55105510

5511+
/*! Parameters controlling raw string detection, as used by the core strings analysis.
5512+
5513+
\see StringDetector
5514+
*/
5515+
struct StringDetectionParameters
5516+
{
5517+
size_t minStringLength = 4;
5518+
bool utf8Enabled = true;
5519+
bool utf16Enabled = true;
5520+
bool utf32Enabled = true;
5521+
std::vector<std::string> unicodeBlockNames;
5522+
5523+
/*! Builds parameters from the standard string-analysis settings:
5524+
"analysis.limits.minStringLength" and "analysis.unicode.{blocks,utf8,utf16,utf32}".
5525+
5526+
\param settings Settings instance to query, e.g. \c Settings::Instance()
5527+
\param view Optional view for view-scoped setting values
5528+
\return Parameters reflecting the given settings
5529+
*/
5530+
static StringDetectionParameters FromSettings(Ref<Settings> settings, Ref<BinaryView> view = nullptr);
5531+
};
5532+
5533+
/*! A compiled string detector using the same detection logic as the core strings analysis.
5534+
5535+
The detector is immutable once constructed, so a single instance may be shared across threads.
5536+
*/
5537+
class StringDetector
5538+
{
5539+
BNStringDetector* m_object;
5540+
5541+
public:
5542+
explicit StringDetector(const StringDetectionParameters& params);
5543+
~StringDetector();
5544+
StringDetector(const StringDetector&) = delete;
5545+
StringDetector& operator=(const StringDetector&) = delete;
5546+
StringDetector(StringDetector&& other) noexcept;
5547+
StringDetector& operator=(StringDetector&& other) noexcept;
5548+
5549+
/*! Detects strings in a raw data buffer.
5550+
5551+
Strings must start within the first \c blockLen bytes of \c data but may extend up to
5552+
\c dataLen bytes, allowing large buffers to be scanned in chunks with a
5553+
\c BN_MAX_STRING_LENGTH overlap tail. \c lastFoundString (optional, in/out,
5554+
zero-initialized before the first call) carries overlap state across consecutive chunk
5555+
calls so strings spanning a chunk boundary are not reported twice.
5556+
5557+
\param data Buffer to scan
5558+
\param dataLen Total number of valid bytes in \c data
5559+
\param blockLen Number of bytes within which strings may start
5560+
\param baseAddress Address reported for offset 0 of \c data
5561+
\param lastFoundString Optional cross-chunk overlap state
5562+
\return The strings found, with addresses relative to \c baseAddress
5563+
*/
5564+
std::vector<BNStringReference> DetectStrings(const void* data, size_t dataLen, size_t blockLen,
5565+
uint64_t baseAddress, BNStringReference* lastFoundString = nullptr) const;
5566+
};
5567+
55115568
struct QualifiedNameAndType;
55125569
struct PossibleValueSet;
55135570
class Metadata;

binaryninjacore.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ extern "C"
348348
typedef struct BNLineFormatter BNLineFormatter;
349349
typedef struct BNRenderLayer BNRenderLayer;
350350
typedef struct BNStringRef BNStringRef;
351+
typedef struct BNStringDetector BNStringDetector;
351352
typedef struct BNIndirectBranchInfo BNIndirectBranchInfo;
352353
typedef struct BNArchitectureAndAddress BNArchitectureAndAddress;
353354
typedef struct BNConstantRenderer BNConstantRenderer;
@@ -5718,6 +5719,32 @@ extern "C"
57185719
BNBinaryView* view, uint64_t start, uint64_t len, size_t* count);
57195720
BINARYNINJACOREAPI void BNFreeStringReferenceList(BNStringReference* strings);
57205721

5722+
// Detection parameters consumed by BNCreateStringDetector.
typedef struct BNStringDetectionParameters
{
	// Minimum string length (the "analysis.limits.minStringLength" setting).
	size_t minStringLength;

	// Toggles corresponding to the "analysis.unicode.utf8" / "utf16" / "utf32" settings.
	bool utf8Enabled;
	bool utf16Enabled;
	bool utf32Enabled;

	// Array of unicodeBlockNameCount Unicode block names, spelled as the
	// "analysis.unicode.blocks" setting accepts them.
	const char* const* unicodeBlockNames;
	size_t unicodeBlockNameCount;
} BNStringDetectionParameters;
5732+
5733+
// A compiled string detector. Immutable once created, so a single instance may be shared
5734+
// across threads. Free with BNFreeStringDetector.
5735+
BINARYNINJACOREAPI BNStringDetector* BNCreateStringDetector(const BNStringDetectionParameters* params);
5736+
BINARYNINJACOREAPI void BNFreeStringDetector(BNStringDetector* detector);
5737+
5738+
// Detects strings starting within the first blockLen bytes of data. Strings may extend up to
5739+
// dataLen bytes, allowing callers to scan large buffers in chunks with a BN_MAX_STRING_LENGTH
5740+
// overlap tail. lastFoundString (optional, in/out, zero-initialize before the first call)
5741+
// carries overlap state across consecutive calls so strings spanning a chunk boundary are not
5742+
// reported twice. Result addresses are relative to baseAddress. Free the result with
5743+
// BNFreeStringReferenceList.
5744+
BINARYNINJACOREAPI BNStringReference* BNStringDetectorDetectStrings(BNStringDetector* detector,
5745+
const uint8_t* data, size_t dataLen, size_t blockLen, uint64_t baseAddress,
5746+
BNStringReference* lastFoundString, size_t* count);
5747+
57215748
BINARYNINJACOREAPI BNDerivedString* BNGetDerivedStrings(BNBinaryView* view, size_t* count);
57225749
BINARYNINJACOREAPI BNReferenceSource* BNGetDerivedStringCodeReferences(
57235750
BNBinaryView* view, BNDerivedString* str, size_t* count, bool limit, size_t maxItems);

binaryview.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4050,6 +4050,74 @@ vector<BNStringReference> BinaryView::GetStrings(uint64_t start, uint64_t len)
40504050
}
40514051

40524052

4053+
// Builds a parameter set by reading each string-analysis setting from `settings`, passing `view`
// through to every lookup.
StringDetectionParameters StringDetectionParameters::FromSettings(Ref<Settings> settings, Ref<BinaryView> view)
{
	StringDetectionParameters fromSettings;

	// The setting is read as a 64-bit integer and stored into the size_t field.
	const uint64_t minLength = settings->Get<uint64_t>("analysis.limits.minStringLength", view);
	fromSettings.minStringLength = minLength;

	// Per-encoding toggles.
	fromSettings.utf8Enabled = settings->Get<bool>("analysis.unicode.utf8", view);
	fromSettings.utf16Enabled = settings->Get<bool>("analysis.unicode.utf16", view);
	fromSettings.utf32Enabled = settings->Get<bool>("analysis.unicode.utf32", view);

	// Unicode block names.
	fromSettings.unicodeBlockNames = settings->Get<vector<string>>("analysis.unicode.blocks", view);

	return fromSettings;
}
4063+
4064+
4065+
// Translates the C++ parameter struct into its core (C) counterpart and creates the core detector.
StringDetector::StringDetector(const StringDetectionParameters& params)
{
	// The core struct takes an array of C strings. These pointers borrow from `params`, which
	// outlives the BNCreateStringDetector call below.
	vector<const char*> rawBlockNames;
	rawBlockNames.reserve(params.unicodeBlockNames.size());
	for (const auto& blockName : params.unicodeBlockNames)
		rawBlockNames.push_back(blockName.c_str());

	BNStringDetectionParameters coreParams;
	coreParams.minStringLength = params.minStringLength;
	coreParams.utf8Enabled = params.utf8Enabled;
	coreParams.utf16Enabled = params.utf16Enabled;
	coreParams.utf32Enabled = params.utf32Enabled;
	coreParams.unicodeBlockNames = rawBlockNames.data();
	coreParams.unicodeBlockNameCount = rawBlockNames.size();

	m_object = BNCreateStringDetector(&coreParams);
}
4080+
4081+
4082+
// Releases the core detector, if this object still holds one.
StringDetector::~StringDetector()
{
	// The handle is null after this object has been moved from; nothing to release then.
	if (!m_object)
		return;

	BNFreeStringDetector(m_object);
}
4087+
4088+
4089+
// Move construction: takes over the core handle held by `other` and leaves `other` empty.
StringDetector::StringDetector(StringDetector&& other) noexcept
{
	m_object = other.m_object;
	// Clear the source so its destructor does not free the handle now owned here.
	other.m_object = nullptr;
}
4093+
4094+
4095+
// Move assignment: frees the currently held core detector (if any), then takes over the handle
// held by `other` and leaves `other` empty.
StringDetector& StringDetector::operator=(StringDetector&& other) noexcept
{
	// Self-assignment must not free the handle we are about to keep.
	if (this == &other)
		return *this;

	if (m_object)
		BNFreeStringDetector(m_object);

	m_object = other.m_object;
	other.m_object = nullptr;
	return *this;
}
4106+
4107+
4108+
// Runs the core detector over `data` and returns the reported string references by value.
//
// Only strings starting within the first `blockLen` bytes are requested from core; `dataLen` is
// the total number of valid bytes, `baseAddress` is the address reported for offset 0, and
// `lastFoundString` (may be null) is forwarded unchanged as cross-chunk overlap state.
//
// Returns an empty vector when this detector holds no core object (for example after it has
// been moved from) or when core returns no list.
vector<BNStringReference> StringDetector::DetectStrings(const void* data, size_t dataLen, size_t blockLen,
	uint64_t baseAddress, BNStringReference* lastFoundString) const
{
	// The destructor and move assignment already treat a null handle as a valid state; do the
	// same here instead of handing a null detector to core.
	if (!m_object)
		return {};

	size_t count = 0;
	BNStringReference* strings = BNStringDetectorDetectStrings(m_object, static_cast<const uint8_t*>(data),
		dataLen, blockLen, baseAddress, lastFoundString, &count);
	if (!strings)
		return {};

	// Copy the references out before releasing the core-owned list.
	vector<BNStringReference> result(strings, strings + count);
	BNFreeStringReferenceList(strings);
	return result;
}
4119+
4120+
40534121
vector<DerivedString> BinaryView::GetDerivedStrings()
40544122
{
40554123
size_t count;

python/binaryview.py

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
import uuid
3333
from typing import Callable, Generator, Optional, Union, Tuple, List, Sequence, Mapping, Any, \
3434
Iterator, Iterable, KeysView, ItemsView, ValuesView, Dict, overload
35-
from dataclasses import dataclass
35+
from dataclasses import dataclass, field
3636
from enum import IntFlag
3737

3838
import collections
@@ -526,6 +526,88 @@ def view(self) -> 'BinaryView':
526526
return self._view
527527

528528

529+
@dataclass
class StringDetectionParameters:
    """Tunable inputs for raw string detection, mirroring those used by the core strings analysis."""
    min_string_length: int = 4
    utf8_enabled: bool = True
    utf16_enabled: bool = True
    utf32_enabled: bool = True
    unicode_block_names: List[str] = field(default_factory=list)

    @classmethod
    def from_settings(
        cls, settings_obj: Optional['settings.Settings'] = None, view: Optional['BinaryView'] = None
    ) -> 'StringDetectionParameters':
        """
        ``from_settings`` creates a parameter set populated from the standard string-analysis
        settings: ``analysis.limits.minStringLength`` and
        ``analysis.unicode.{blocks,utf8,utf16,utf32}``.

        :param settings_obj: Settings object to query; a new ``settings.Settings()`` is used when omitted
        :param view: Optional view passed through to each settings lookup
        :return: Parameters populated from those settings
        """
        source = settings.Settings() if settings_obj is None else settings_obj

        min_length = source.get_integer("analysis.limits.minStringLength", view)
        utf8 = source.get_bool("analysis.unicode.utf8", view)
        utf16 = source.get_bool("analysis.unicode.utf16", view)
        utf32 = source.get_bool("analysis.unicode.utf32", view)
        blocks = source.get_string_list("analysis.unicode.blocks", view)

        return cls(
            min_string_length=min_length,
            utf8_enabled=utf8,
            utf16_enabled=utf16,
            utf32_enabled=utf32,
            unicode_block_names=blocks,
        )
555+
556+
557+
@dataclass(frozen=True)
class DetectedString:
    """One result produced by :py:func:`detect_strings_in_block`.

    ``type`` is the string's :py:class:`StringType`, ``start`` is relative to the ``base_address``
    given to the detector, and ``length`` is measured in bytes."""
    type: StringType
    start: int
    length: int
564+
565+
566+
def detect_strings_in_block(
    data: bytes, base_address: int = 0, parameters: Optional[StringDetectionParameters] = None
) -> List[DetectedString]:
    """
    ``detect_strings_in_block`` runs the core strings-analysis detection logic over a raw data
    buffer. In contrast to :py:meth:`BinaryView.get_strings`, the buffer does not have to belong
    to a BinaryView.

    The whole of ``data`` is scanned as a single block: its length is passed as both the total
    length and the block length, and no cross-chunk state is supplied. A detector is created for
    the call and freed before returning.

    :param data: Buffer to scan
    :param base_address: Address reported for offset 0 of ``data``
    :param parameters: Detection parameters; defaults to the current global settings
    :return: The strings detected
    """
    if parameters is None:
        parameters = StringDetectionParameters.from_settings()

    # Marshal the Unicode block names into a C array of char pointers.
    names = parameters.unicode_block_names
    name_array = (ctypes.c_char_p * len(names))()
    for index, block_name in enumerate(names):
        name_array[index] = core.cstr(block_name)

    core_params = core.BNStringDetectionParameters()
    core_params.minStringLength = parameters.min_string_length
    core_params.utf8Enabled = parameters.utf8_enabled
    core_params.utf16Enabled = parameters.utf16_enabled
    core_params.utf32Enabled = parameters.utf32_enabled
    core_params.unicodeBlockNames = ctypes.cast(name_array, ctypes.POINTER(ctypes.c_char_p))
    core_params.unicodeBlockNameCount = len(names)

    detector = core.BNCreateStringDetector(core_params)
    assert detector is not None, "core.BNCreateStringDetector returned None"
    try:
        size = len(data)
        raw = (ctypes.c_ubyte * size).from_buffer_copy(data)
        found = ctypes.c_ulonglong()
        refs = core.BNStringDetectorDetectStrings(detector, raw, size, size, base_address, None, found)
        assert refs is not None, "core.BNStringDetectorDetectStrings returned None"
        try:
            # Convert to Python objects before the core-owned list is released below.
            return [
                DetectedString(StringType(refs[i].type), refs[i].start, refs[i].length)
                for i in range(found.value)
            ]
        finally:
            core.BNFreeStringReferenceList(refs)
    finally:
        core.BNFreeStringDetector(detector)
609+
610+
529611
class StringRef:
530612
"""Deduplicated reference to a string owned by the Binary Ninja core. Use `str` or `bytes` to convert
531613
this to a standard Python string or sequence of bytes."""

rust/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ pub mod section;
7979
pub mod segment;
8080
pub mod settings;
8181
pub mod string;
82+
pub mod string_detection;
8283
pub mod symbol;
8384
pub mod tags;
8485
pub mod template_simplifier;

0 commit comments

Comments
 (0)