GML-2076 router improvement #37
Conversation
…IO, remove unused endpoint
Adds an "Auto" mode that picks among the four search methods per question using deterministic rules first, falling back to an LLM call. Manual method selection still works as a transitional override.
Surfaces the auto-selection in the UI, adds Prometheus telemetry for the selector's distribution, and short-circuits to an honest "no info" answer when the chosen retriever returns nothing.
PR Reviewer Guide 🔍 — Here are some key observations to aid the review process:
|
| @router.get(route_prefix + "/trace/{message_id}") | ||
| def get_trace_log( | ||
| message_id: str, | ||
| creds: Annotated[tuple[list[str], HTTPBasicCredentials], Depends(ui_basic_auth)], | ||
| ): | ||
| filepath = os.path.join(TRACE_LOGS_DIR, f"{message_id}.json") | ||
| if not os.path.exists(filepath): | ||
| raise HTTPException(status_code=404, detail="Trace log not found") | ||
| with open(filepath, "r") as f: | ||
| return json.load(f) |
There was a problem hiding this comment.
Suggestion: The message_id is interpolated directly into a filesystem path, so a crafted value like ../../../tmp/x can escape TRACE_LOGS_DIR and read arbitrary JSON files. Restrict message_id to a safe filename format and verify the resolved path stays under TRACE_LOGS_DIR before opening it. [security, importance: 9]
| @router.get(route_prefix + "/trace/{message_id}") | |
| def get_trace_log( | |
| message_id: str, | |
| creds: Annotated[tuple[list[str], HTTPBasicCredentials], Depends(ui_basic_auth)], | |
| ): | |
| filepath = os.path.join(TRACE_LOGS_DIR, f"{message_id}.json") | |
| if not os.path.exists(filepath): | |
| raise HTTPException(status_code=404, detail="Trace log not found") | |
| with open(filepath, "r") as f: | |
| return json.load(f) | |
| @router.get(route_prefix + "/trace/{message_id}") | |
| def get_trace_log( | |
| message_id: str, | |
| creds: Annotated[tuple[list[str], HTTPBasicCredentials], Depends(ui_basic_auth)], | |
| ): | |
| if not re.fullmatch(r"[A-Za-z0-9_-]+", message_id): | |
| raise HTTPException(status_code=400, detail="Invalid message_id") | |
| base_dir = os.path.abspath(TRACE_LOGS_DIR) | |
| filepath = os.path.abspath(os.path.join(base_dir, f"{message_id}.json")) | |
| if os.path.commonpath([base_dir, filepath]) != base_dir: | |
| raise HTTPException(status_code=400, detail="Invalid message_id") | |
| if not os.path.exists(filepath): | |
| raise HTTPException(status_code=404, detail="Trace log not found") | |
| with open(filepath, "r") as f: | |
| return json.load(f) |
| const message = resolvedMessage || (apiData ? { | ||
| content: apiData.natural_language_response, | ||
| response_time: apiData.response_time, | ||
| response_type: apiData.response_type, | ||
| query_sources: apiData.query_sources, | ||
| } : null); | ||
| const userQuery = stateUserQuery || sessionMessage?.userQuery || apiData?.user_query; | ||
|
|
||
| const trace = useMemo( | ||
| () => buildTraceFromMessage(message, userQuery), | ||
| [message, userQuery] |
There was a problem hiding this comment.
Suggestion: When the fetch fails or no trace exists, message becomes null but buildTraceFromMessage still runs and renders a fake "completed" trace with "N/A" content. Guard against missing data and show an explicit not-found/error state instead, so users do not see misleading trace details. [possible issue, importance: 7]
| const message = resolvedMessage || (apiData ? { | |
| content: apiData.natural_language_response, | |
| response_time: apiData.response_time, | |
| response_type: apiData.response_type, | |
| query_sources: apiData.query_sources, | |
| } : null); | |
| const userQuery = stateUserQuery || sessionMessage?.userQuery || apiData?.user_query; | |
| const trace = useMemo( | |
| () => buildTraceFromMessage(message, userQuery), | |
| [message, userQuery] | |
| const message = resolvedMessage || (apiData ? { | |
| content: apiData.natural_language_response, | |
| response_time: apiData.response_time, | |
| response_type: apiData.response_type, | |
| query_sources: apiData.query_sources, | |
| } : null); | |
| const userQuery = stateUserQuery || sessionMessage?.userQuery || apiData?.user_query; | |
| const trace = useMemo( | |
| () => (message ? buildTraceFromMessage(message, userQuery) : null), | |
| [message, userQuery] | |
| ); | |
| if (!loading && !trace) { | |
| return ( | |
| <div className="min-h-screen bg-background flex items-center justify-center"> | |
| <p className="text-muted-foreground">Trace data not found.</p> | |
| </div> | |
| ); | |
| } |
| elif extension in ['.xlsx', '.xls']: | ||
| import pandas as pd | ||
| engine = 'openpyxl' if extension == '.xlsx' else 'xlrd' | ||
| try: | ||
| xl = pd.ExcelFile(file_path, engine=engine) | ||
| except Exception: | ||
| xl = pd.ExcelFile(file_path) | ||
| sheet_texts = [] | ||
| for sheet_name in xl.sheet_names: | ||
| # Always read with header=None so no data row is silently | ||
| # consumed as column names for headerless spreadsheets. | ||
| df = xl.parse(sheet_name, header=None) | ||
| if df.empty: | ||
| continue | ||
| df = df.fillna('') | ||
| # Detect header row: first row is all non-empty strings with | ||
| # no purely numeric values → treat as column names. | ||
| first_row = df.iloc[0] | ||
| if all(isinstance(v, str) and v.strip() for v in first_row): | ||
| df.columns = first_row.tolist() | ||
| df = df.iloc[1:].reset_index(drop=True) | ||
| else: | ||
| df.columns = [f"Column {i + 1}" for i in range(len(df.columns))] | ||
| sheet_md = df.to_markdown(index=False) | ||
| sheet_texts.append(f"## Sheet: {sheet_name}\n\n{sheet_md}") | ||
| return "\n\n".join(sheet_texts) if sheet_texts else "[Excel file is empty or contains no data]" |
There was a problem hiding this comment.
Suggestion: The header detection treats any non-empty string first row as headers, so ordinary string-valued first records are dropped from the extracted text. Use a stricter heuristic, such as requiring unique column names and at least one subsequent data row, before removing the first row. [possible issue, importance: 6]
| elif extension in ['.xlsx', '.xls']: | |
| import pandas as pd | |
| engine = 'openpyxl' if extension == '.xlsx' else 'xlrd' | |
| try: | |
| xl = pd.ExcelFile(file_path, engine=engine) | |
| except Exception: | |
| xl = pd.ExcelFile(file_path) | |
| sheet_texts = [] | |
| for sheet_name in xl.sheet_names: | |
| # Always read with header=None so no data row is silently | |
| # consumed as column names for headerless spreadsheets. | |
| df = xl.parse(sheet_name, header=None) | |
| if df.empty: | |
| continue | |
| df = df.fillna('') | |
| # Detect header row: first row is all non-empty strings with | |
| # no purely numeric values → treat as column names. | |
| first_row = df.iloc[0] | |
| if all(isinstance(v, str) and v.strip() for v in first_row): | |
| df.columns = first_row.tolist() | |
| df = df.iloc[1:].reset_index(drop=True) | |
| else: | |
| df.columns = [f"Column {i + 1}" for i in range(len(df.columns))] | |
| sheet_md = df.to_markdown(index=False) | |
| sheet_texts.append(f"## Sheet: {sheet_name}\n\n{sheet_md}") | |
| return "\n\n".join(sheet_texts) if sheet_texts else "[Excel file is empty or contains no data]" | |
| elif extension in ['.xlsx', '.xls']: | |
| import pandas as pd | |
| engine = 'openpyxl' if extension == '.xlsx' else 'xlrd' | |
| try: | |
| xl = pd.ExcelFile(file_path, engine=engine) | |
| except Exception: | |
| xl = pd.ExcelFile(file_path) | |
| sheet_texts = [] | |
| for sheet_name in xl.sheet_names: | |
| # Always read with header=None so no data row is silently | |
| # consumed as column names for headerless spreadsheets. | |
| df = xl.parse(sheet_name, header=None) | |
| if df.empty: | |
| continue | |
| df = df.fillna('') | |
| first_row = df.iloc[0] | |
| first_row_values = [str(v).strip() for v in first_row] | |
| looks_like_header = ( | |
| len(df) > 1 | |
| and all(first_row_values) | |
| and len(set(first_row_values)) == len(first_row_values) | |
| and not any(v.isdigit() for v in first_row_values) | |
| ) | |
| if looks_like_header: | |
| df.columns = first_row_values | |
| df = df.iloc[1:].reset_index(drop=True) | |
| else: | |
| df.columns = [f"Column {i + 1}" for i in range(len(df.columns))] | |
| sheet_md = df.to_markdown(index=False) | |
| sheet_texts.append(f"## Sheet: {sheet_name}\n\n{sheet_md}") | |
| return "\n\n".join(sheet_texts) if sheet_texts else "[Excel file is empty or contains no data]" |
PR Type

Enhancement, Documentation, Tests

Description

- Add auto retriever selection flow ("Auto" mode)
- Add trace logging and trace UI
- Improve answer safety and ingestion
- Update docs, bump version to 1.4.0, and adjust CI

Diagram Walkthrough
File Walkthrough

Enhancements (14 files):
- Save and serve per-message trace logs
- Add auto retriever selection and OOC guard
- Implement rule-based and LLM retriever chooser
- Capture per-node traces and token totals
- Add context-local LLM usage collection helpers
- Improve CSV and Excel text extraction
- Add retriever method selection metric
- Add full trace logs inspection page
- Show retriever badge and open traces
- Preserve last user query for traces
- Add role-gated View Trace action
- Default chat retrieval mode to Auto
- Warn about skipped and spreadsheet uploads
- Register authenticated trace page route

Tests (1 file):
- Cover selector rules, LLM path, fallback

Documentation (2 files):
- Add tuning guideline and release updates
- Document `1.4.0` retrieval improvements

Dependencies (2 files):
- Add UI packages for trace page support
- Add Excel parsing runtime dependencies

Configuration (7 files):
- Proxy trace UI routes through nginx
- Mount persistent trace logs volume
- Skip nightly cloud builds on docs changes
- Skip cloud CI builds on docs changes
- Skip nightly on-prem builds on docs changes
- Skip on-prem builds on docs changes
- Bump project version to `1.4.0`