Skip to content

Commit 414b7f1

Browse files
authored
[lang] add support for python (ttiimm#15)
Tested with https://github.com/nicolargo/glances since it is a relatively complex Python program that generates a log.
Files:
* Cargo.toml: Add python tree-sitter crate.
* Tasks.md: Use task markers so we can check them off.
* lib.rs: Refactor the placeholder regex and add python to SourceLanguage.
* source_ref.rs: The `escape_ignore_newlines()` needs to convert octal escapes to hex. Also, handle raw strings since the backslashes need to be escaped.
1 parent ffc9147 commit 414b7f1

7 files changed

Lines changed: 311 additions & 73 deletions

File tree

Cargo.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ tree-sitter = "0.25.3"
1717
tree-sitter-cpp = "0.23.4"
1818
tree-sitter-rust-orchard = "0.12.0"
1919
tree-sitter-java = "0.23.5"
20+
tree-sitter-python = "0.25.0"
2021
rayon = "1.11.0"
2122
miette = { version = "7.6.0", features = ["fancy"] }
2223

docs/Tasks.md

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,23 @@
11
# Tasks
22

3-
1. Handle running with no log format.
4-
1. Extract a thread id from log when available and associate with source ref.
5-
1. Generate call stack from exceptions.
6-
1. Support multiple source roots from CLI.
7-
1. Serialize state for re-use on subsequent executions
3+
- [ ] Handle running CLI with no log format.
4+
- TSS: Doesn't this work already? I echo
5+
the body of the log message into log2src
6+
and it can find the message.
7+
- [ ] Extract a thread id from log when available and associate with source ref.
8+
- [ ] Generate call stack from exceptions.
9+
- [ ] Support multiple source roots from CLI.
10+
- [ ] Serialize state for re-use on subsequent executions
811

912
## Extension
1013

11-
1. Work with non .log extension (.json, etc).
12-
1. Basic test coverage
13-
1. Support src -> log breakpoints
14+
- [ ] Work with non .log extension (.json, etc).
15+
- [ ] Basic test coverage
16+
- [ ] Support src -> log breakpoints
1417

1518
## Languages
1619

17-
1. Python
18-
1. Go
19-
1. JavaScript
20+
- [X] Python
21+
- [ ] Go
22+
- [ ] JavaScript
23+
- [ ] TypeScript

src/lib.rs

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
use itertools::Itertools;
22
use miette::Diagnostic;
33
use rayon::prelude::*;
4-
use regex::RegexSet;
4+
use regex::{Captures, Regex, RegexSet};
55
use serde::Serialize;
66
use std::collections::HashMap;
77
use std::ffi::OsStr;
88
use std::fs::File;
99
use std::io;
10-
use std::ops::RangeBounds;
10+
use std::ops::{Deref, RangeBounds};
1111
use std::path::{Path, PathBuf};
12-
use std::sync::Arc;
12+
use std::sync::{Arc, LazyLock};
1313
use thiserror::Error;
1414
use tree_sitter::Language;
1515

@@ -254,6 +254,7 @@ pub enum SourceLanguage {
254254
Java,
255255
#[serde(rename = "C++")]
256256
Cpp,
257+
Python,
257258
}
258259

259260
impl From<SourceLanguage> for Language {
@@ -262,6 +263,7 @@ impl From<SourceLanguage> for Language {
262263
SourceLanguage::Rust => tree_sitter_rust_orchard::LANGUAGE.into(),
263264
SourceLanguage::Java => tree_sitter_java::LANGUAGE.into(),
264265
SourceLanguage::Cpp => tree_sitter_cpp::LANGUAGE.into(),
266+
SourceLanguage::Python => tree_sitter_python::LANGUAGE.into(),
265267
}
266268
}
267269
}
@@ -270,12 +272,30 @@ const IDENTS_RS: &[&str] = &["debug", "info", "warn"];
270272
const IDENTS_JAVA: &[&str] = &["logger", "log", "fine", "debug", "info", "warn", "trace"];
271273
const IDENTS_CPP: &[&str] = &["debug", "info", "warn", "trace"];
272274

275+
const IDENTS_PYTHON: &[&str] = &["debug", "info", "warn", "trace"];
276+
277+
static RUST_PLACEHOLDER_REGEX: LazyLock<Regex> = LazyLock::new(|| {
278+
Regex::new(r#"\{(?:([a-zA-Z_][a-zA-Z0-9_.]*)|(\d+))?\s*(?::[^}]*)?}"#).unwrap()
279+
});
280+
281+
static JAVA_PLACEHOLDER_REGEX: LazyLock<Regex> =
282+
LazyLock::new(|| Regex::new(r#"\{.*}|\\\{(.*)}"#).unwrap());
283+
284+
static CPP_PLACEHOLDER_REGEX: LazyLock<Regex> = LazyLock::new(|| {
285+
Regex::new(r#"%[-+ #0]*\d*(?:\.\d+)?[hlLzjt]*[diuoxXfFeEgGaAcspn%]|\{(?:([a-zA-Z_][a-zA-Z0-9_.]*)|(\d+))?\s*(?::[^}]*)?}"#).unwrap()
286+
});
287+
288+
static PYTHON_PLACEHOLDER_REGEX: LazyLock<Regex> = LazyLock::new(|| {
289+
Regex::new(r#"%[-+ #0]*\d*(?:\.\d+)?[hlLzjt]*[diuoxXfFeEgGaAcspn%]"#).unwrap()
290+
});
291+
273292
impl SourceLanguage {
274293
pub fn as_str(&self) -> &'static str {
275294
match self {
276295
SourceLanguage::Rust => "Rust",
277296
SourceLanguage::Java => "Java",
278297
SourceLanguage::Cpp => "C++",
298+
SourceLanguage::Python => "Python",
279299
}
280300
}
281301

@@ -284,6 +304,7 @@ impl SourceLanguage {
284304
Some("rs") => Some(Self::Rust),
285305
Some("java") => Some(Self::Java),
286306
Some("h" | "hh" | "hpp" | "hxx" | "tpp" | "cc" | "cpp" | "cxx") => Some(Self::Cpp),
307+
Some("py") => Some(Self::Python),
287308
None | Some(_) => None,
288309
}
289310
}
@@ -339,6 +360,20 @@ impl SourceLanguage {
339360
)
340361
"#
341362
}
363+
SourceLanguage::Python => {
364+
r#"
365+
(
366+
(expression_statement
367+
(call
368+
function: (_) @func
369+
arguments: (argument_list .
370+
(string) @args
371+
)
372+
)
373+
)
374+
)
375+
"#
376+
}
342377
}
343378
}
344379

@@ -347,7 +382,34 @@ impl SourceLanguage {
347382
SourceLanguage::Rust => IDENTS_RS,
348383
SourceLanguage::Java => IDENTS_JAVA,
349384
SourceLanguage::Cpp => IDENTS_CPP,
385+
SourceLanguage::Python => IDENTS_PYTHON,
386+
}
387+
}
388+
389+
fn get_placeholder_regex(&self) -> &'static Regex {
390+
match self {
391+
SourceLanguage::Rust => RUST_PLACEHOLDER_REGEX.deref(),
392+
SourceLanguage::Java => JAVA_PLACEHOLDER_REGEX.deref(),
393+
SourceLanguage::Cpp => CPP_PLACEHOLDER_REGEX.deref(),
394+
SourceLanguage::Python => PYTHON_PLACEHOLDER_REGEX.deref(),
395+
}
396+
}
397+
398+
fn captures_to_format_arg(&self, caps: &Captures) -> FormatArgument {
399+
for (index, cap) in caps.iter().skip(1).enumerate() {
400+
if let Some(cap) = cap {
401+
return match (self, index) {
402+
(SourceLanguage::Rust | SourceLanguage::Java | SourceLanguage::Cpp, 0) => {
403+
FormatArgument::Named(cap.as_str().to_string())
404+
}
405+
(SourceLanguage::Rust | SourceLanguage::Cpp, 1) => {
406+
FormatArgument::Positional(cap.as_str().parse().unwrap())
407+
}
408+
_ => unreachable!(),
409+
};
410+
}
350411
}
412+
FormatArgument::Placeholder
351413
}
352414
}
353415

@@ -513,7 +575,7 @@ pub fn extract_logging_guarded(sources: &[CodeSource], guard: &WorkGuard) -> Vec
513575
for result in results {
514576
// println!("node.kind()={:?} range={:?}", result.kind, result.range);
515577
match result.kind.as_str() {
516-
"string_literal" => {
578+
"string_literal" | "string" => {
517579
if let Some(src_ref) = SourceRef::new(code, result) {
518580
patterns.push(src_ref.pattern.clone());
519581
matched.push(src_ref);
@@ -852,4 +914,32 @@ fn main() {
852914
},]
853915
);
854916
}
917+
918+
const PYTHON_SOURCE: &str = r#"
919+
def main(args):
920+
logger.info("foo %s \N{greek small letter pi}", test_var)
921+
logging.info(f'Hello, {args[1]}!')
922+
logger.warning(f"warning message:\nlow disk space")
923+
logger.info(rf"""info message:
924+
processing started -- {args[0]}""")
925+
"#;
926+
927+
#[test]
928+
fn test_basic_python() {
929+
let log_ref = LogRef::new("foo bar π");
930+
let code = CodeSource::from_string(&Path::new("in-mem.py"), PYTHON_SOURCE);
931+
let src_refs = extract_logging(&[code], &ProgressTracker::new())
932+
.pop()
933+
.unwrap()
934+
.log_statements;
935+
assert_yaml_snapshot!(src_refs);
936+
let vars = extract_variables(&log_ref, &src_refs[0]);
937+
assert_eq!(
938+
vars,
939+
vec![VariablePair {
940+
expr: "test_var".to_string(),
941+
value: "bar".to_string()
942+
},]
943+
);
944+
}
855945
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
---
2+
source: src/lib.rs
3+
expression: src_refs
4+
---
5+
- sourcePath: in-mem.py
6+
language: Python
7+
lineNumber: 3
8+
endLineNumber: 3
9+
column: 16
10+
name: main
11+
text: "\"foo %s \\N{greek small letter pi}\""
12+
quality: 5
13+
pattern: "(?s)^foo (.+) \\w$"
14+
args:
15+
- Placeholder
16+
vars:
17+
- test_var
18+
- sourcePath: in-mem.py
19+
language: Python
20+
lineNumber: 4
21+
endLineNumber: 4
22+
column: 17
23+
name: main
24+
text: "f'Hello, {args[1]}!'"
25+
quality: 7
26+
pattern: "(?s)^Hello, (.+)!$"
27+
args:
28+
- Named: "args[1]"
29+
vars: []
30+
- sourcePath: in-mem.py
31+
language: Python
32+
lineNumber: 5
33+
endLineNumber: 5
34+
column: 19
35+
name: main
36+
text: "f\"warning message:\\nlow disk space\""
37+
quality: 29
38+
pattern: "(?s)^warning message:\\nlow disk space$"
39+
args: []
40+
vars: []
41+
- sourcePath: in-mem.py
42+
language: Python
43+
lineNumber: 6
44+
endLineNumber: 7
45+
column: 16
46+
name: main
47+
text: "rf\"\"\"info message:\nprocessing started -- {args[0]}\"\"\""
48+
quality: 33
49+
pattern: "(?s)^info message:\\nprocessing started -- (.+)$"
50+
args:
51+
- Named: "args[0]"
52+
vars: []

0 commit comments

Comments
 (0)