diff --git a/common/requirements.txt b/common/requirements.txt
index 0a7c34f..f4d5ac6 100644
--- a/common/requirements.txt
+++ b/common/requirements.txt
@@ -105,6 +105,8 @@ nest-asyncio==1.6.0
 nltk==3.9.1
 numpy>=1, <2
 openai==1.92.2
+openpyxl>=3.1.0
+xlrd>=2.0.1
 ordered-set==4.1.0
 orjson==3.10.18
 packaging==24.2
diff --git a/common/utils/text_extractors.py b/common/utils/text_extractors.py
index 449ace5..82442ba 100644
--- a/common/utils/text_extractors.py
+++ b/common/utils/text_extractors.py
@@ -290,7 +290,7 @@ async def process_with_semaphore(file_path):
                 'error': result.get('error', 'Unknown error')
             })
 
-    logger.info(f"Prepared {len(processed_files_info)} files ({len(jsonl_files_copied)} JSONL copied, {len(files_to_process)} converted), {total_docs} total documents")
+    logger.info(f"Processed {len(processed_files_info)} files, extracted {total_docs} total documents")
     logger.info(f"Created {len([f for f in processed_files_info if f.get('status') == 'success'])} JSONL files in {temp_folder}")
 
     return {
@@ -613,9 +613,22 @@ def extract_text_from_file(file_path, graphname=None):
     if extension in ['.txt', '.md']:
         with open(file_path, 'r', encoding='utf-8') as f:
             return f.read().strip()
-    elif extension in ['.html', '.htm', '.csv']:
+    elif extension in ['.html', '.htm']:
         with open(file_path, 'r', encoding='utf-8') as f:
             return f.read().strip()
+    elif extension == '.csv':
+        raw = file_path.read_bytes()
+        # utf-8-sig handles UTF-8 with a BOM (common in Excel CSV exports)
+        try:
+            return raw.decode('utf-8-sig').strip()
+        except UnicodeDecodeError:
+            pass
+        # Fall back to chardet detection
+        import chardet
+        detected = chardet.detect(raw)
+        encoding = detected.get('encoding') if detected.get('confidence', 0) >= 0.5 else None
+        # latin-1 as the final fallback: decoding latin-1 never raises UnicodeDecodeError
+        return raw.decode(encoding or 'latin-1').strip()
     elif extension == '.json':
         with open(file_path, 'r', encoding='utf-8') as f:
             data = json.load(f)
@@ -624,6 +637,32 @@ def extract_text_from_file(file_path, graphname=None):
         import docx
         doc = docx.Document(file_path)
         return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+    elif extension in ['.xlsx', '.xls']:
+        import pandas as pd
+        engine = 'openpyxl' if extension == '.xlsx' else 'xlrd'
+        try:
+            xl = pd.ExcelFile(file_path, engine=engine)
+        except Exception:
+            xl = pd.ExcelFile(file_path)
+        sheet_texts = []
+        for sheet_name in xl.sheet_names:
+            # Always read with header=None so no data row is silently
+            # consumed as column names for headerless spreadsheets.
+            df = xl.parse(sheet_name, header=None)
+            if df.empty:
+                continue
+            df = df.fillna('')
+            # Detect a header row: if the first row is all non-empty strings
+            # (no purely numeric values), treat it as column names.
+            first_row = df.iloc[0]
+            if all(isinstance(v, str) and v.strip() for v in first_row):
+                df.columns = first_row.tolist()
+                df = df.iloc[1:].reset_index(drop=True)
+            else:
+                df.columns = [f"Column {i + 1}" for i in range(len(df.columns))]
+            sheet_md = df.to_markdown(index=False)
+            sheet_texts.append(f"## Sheet: {sheet_name}\n\n{sheet_md}")
+        return "\n\n".join(sheet_texts) if sheet_texts else "[Excel file is empty or contains no data]"
     elif extension == '.xml':
         import xml.etree.ElementTree as ET
         tree = ET.parse(file_path)
@@ -663,7 +702,7 @@ def get_doc_type_from_extension(extension):
 
 def get_supported_extensions():
     """Get list of supported file extensions."""
-    return {'.txt', '.md', '.html', '.htm', '.csv', '.json', '.pdf', '.docx', '.xml', '.jpeg', '.jpg', '.png', '.gif'}
+    return {'.txt', '.md', '.html', '.htm', '.csv', '.json', '.pdf', '.docx', '.doc', '.xml', '.jpeg', '.jpg', '.png', '.gif', '.xlsx', '.xls', '.jsonl'}
 
 def is_supported_file(file_path):
     """Check if a file is supported for text extraction."""
diff --git a/graphrag-ui/src/pages/setup/IngestGraph.tsx b/graphrag-ui/src/pages/setup/IngestGraph.tsx
index db9677a..37a0d7a 100644
--- a/graphrag-ui/src/pages/setup/IngestGraph.tsx
+++ b/graphrag-ui/src/pages/setup/IngestGraph.tsx
@@ -1006,6 +1006,32 @@ const IngestGraph: React.FC = ({ isModal = false }) => {
                   ? `Upload destination: uploads/${ingestGraphName}/`
                   : ""}
+                {selectedFiles && (() => {
+                  const SUPPORTED_EXTENSIONS = new Set([".txt", ".md", ".pdf", ".docx", ".doc", ".html", ".htm", ".json", ".csv", ".xlsx", ".xls", ".xml", ".jpeg", ".jpg", ".png", ".gif", ".jsonl"]);
+                  const files = Array.from(selectedFiles);
+                  const unsupported = files.filter((f) => !SUPPORTED_EXTENSIONS.has(f.name.slice(f.name.lastIndexOf(".")).toLowerCase()));
+                  const hasCsvExcel = files.some((f) => [".csv", ".xlsx", ".xls"].includes(f.name.slice(f.name.lastIndexOf(".")).toLowerCase()));
+                  return (
+                    <>
+                      {unsupported.length > 0 && (
+                        <div>
+                          ⚠️ Unsupported file type{unsupported.length > 1 ? "s" : ""}: {unsupported.map((f) => f.name).join(", ")}. These files will be skipped during ingestion.
+                        </div>
+                      )}
+                      {hasCsvExcel && (
+                        <div>
+                          ℹ️ CSV and Excel files will be treated as unstructured text documents.
+                        </div>
+                      )}
+                    </>
+                  );
+                })()}
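
As a quick illustration of the header-detection heuristic in the new `.xlsx`/`.xls` branch of `extract_text_from_file`, here is a minimal standalone sketch. The `normalize_sheet` helper name and the sample frames are illustrative only and are not part of this change.

```python
import pandas as pd

def normalize_sheet(df: pd.DataFrame) -> pd.DataFrame:
    """Mirror the Excel branch: promote the first row to column names only
    when every cell in it is a non-empty string; otherwise synthesize
    generic 'Column N' names so no data row is lost."""
    df = df.fillna('')
    first_row = df.iloc[0]
    if all(isinstance(v, str) and v.strip() for v in first_row):
        df.columns = first_row.tolist()
        df = df.iloc[1:].reset_index(drop=True)
    else:
        df.columns = [f"Column {i + 1}" for i in range(len(df.columns))]
    return df

# Sheet read with header=None whose first row holds the real headers.
print(normalize_sheet(pd.DataFrame([["name", "qty"], ["apples", 3], ["pears", 5]])))

# Purely numeric sheet: nothing is promoted, generic column names are used.
print(normalize_sheet(pd.DataFrame([[1, 2], [3, 4]])))
```

Note that the `df.to_markdown(index=False)` call in the actual branch relies on the `tabulate` package, which pandas uses for markdown rendering.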