Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions common/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ nest-asyncio==1.6.0
nltk==3.9.1
numpy>=1, <2
openai==1.92.2
openpyxl>=3.1.0
xlrd>=2.0.1
ordered-set==4.1.0
orjson==3.10.18
packaging==24.2
Expand Down
45 changes: 42 additions & 3 deletions common/utils/text_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ async def process_with_semaphore(file_path):
'error': result.get('error', 'Unknown error')
})

logger.info(f"Prepared {len(processed_files_info)} files ({len(jsonl_files_copied)} JSONL copied, {len(files_to_process)} converted), {total_docs} total documents")
logger.info(f"Processed {len(processed_files_info)} files, extracted {total_docs} total documents")
logger.info(f"Created {len([f for f in processed_files_info if f.get('status') == 'success'])} JSONL files in {temp_folder}")

return {
Expand Down Expand Up @@ -613,9 +613,22 @@ def extract_text_from_file(file_path, graphname=None):
if extension in ['.txt', '.md']:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read().strip()
elif extension in ['.html', '.htm', '.csv']:
elif extension in ['.html', '.htm']:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read().strip()
elif extension == '.csv':
raw = file_path.read_bytes()
# utf-8-sig handles UTF-8 with BOM (common Excel CSV export)
try:
return raw.decode('utf-8-sig').strip()
except UnicodeDecodeError:
pass
# Fall back to chardet detection
import chardet
detected = chardet.detect(raw)
encoding = detected.get('encoding') if detected.get('confidence', 0) >= 0.5 else None
# latin-1 as final fallback — never raises DecodeError
return raw.decode(encoding or 'latin-1').strip()
elif extension == '.json':
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
Expand All @@ -624,6 +637,32 @@ def extract_text_from_file(file_path, graphname=None):
import docx
doc = docx.Document(file_path)
return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
elif extension in ['.xlsx', '.xls']:
import pandas as pd
engine = 'openpyxl' if extension == '.xlsx' else 'xlrd'
try:
xl = pd.ExcelFile(file_path, engine=engine)
except Exception:
xl = pd.ExcelFile(file_path)
sheet_texts = []
for sheet_name in xl.sheet_names:
# Always read with header=None so no data row is silently
# consumed as column names for headerless spreadsheets.
df = xl.parse(sheet_name, header=None)
if df.empty:
continue
df = df.fillna('')
# Detect header row: first row is all non-empty strings with
# no purely numeric values → treat as column names.
first_row = df.iloc[0]
if all(isinstance(v, str) and v.strip() for v in first_row):
df.columns = first_row.tolist()
df = df.iloc[1:].reset_index(drop=True)
else:
df.columns = [f"Column {i + 1}" for i in range(len(df.columns))]
sheet_md = df.to_markdown(index=False)
sheet_texts.append(f"## Sheet: {sheet_name}\n\n{sheet_md}")
return "\n\n".join(sheet_texts) if sheet_texts else "[Excel file is empty or contains no data]"
Comment thread
prinskumar-tigergraph marked this conversation as resolved.
elif extension == '.xml':
import xml.etree.ElementTree as ET
tree = ET.parse(file_path)
Expand Down Expand Up @@ -663,7 +702,7 @@ def get_doc_type_from_extension(extension):

def get_supported_extensions():
    """Return the set of file extensions supported for text extraction.

    Returns:
        set[str]: lowercase extensions (including the leading dot) that
        ``extract_text_from_file`` knows how to handle. Membership tests
        against a set are O(1), which is why a set (not a list) is used.
    """
    # Single return: the diff artifact that left an earlier, stale return
    # (missing .doc/.xlsx/.xls/.jsonl) before this one has been removed —
    # the stale line made the updated extension set unreachable.
    return {
        '.txt', '.md', '.html', '.htm', '.csv', '.json', '.pdf',
        '.docx', '.doc', '.xml', '.jpeg', '.jpg', '.png', '.gif',
        '.xlsx', '.xls', '.jsonl',
    }

def is_supported_file(file_path):
"""Check if a file is supported for text extraction."""
Expand Down
26 changes: 26 additions & 0 deletions graphrag-ui/src/pages/setup/IngestGraph.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,32 @@ const IngestGraph: React.FC<IngestGraphProps> = ({ isModal = false }) => {
? `Upload destination: uploads/${ingestGraphName}/`
: ""}
</p>
{selectedFiles && (() => {
const SUPPORTED_EXTENSIONS = new Set([".txt", ".md", ".pdf", ".docx", ".doc", ".html", ".htm", ".json", ".csv", ".xlsx", ".xls", ".xml", ".jpeg", ".jpg", ".png", ".gif", ".jsonl"]);
const files = Array.from(selectedFiles);
const unsupported = files.filter((f) => !SUPPORTED_EXTENSIONS.has(f.name.slice(f.name.lastIndexOf(".")).toLowerCase()));
const hasCsvExcel = files.some((f) => [".csv", ".xlsx", ".xls"].includes(f.name.slice(f.name.lastIndexOf(".")).toLowerCase()));
return (
<>
{unsupported.length > 0 && (
<div className="flex items-start gap-2 mt-2 p-2 rounded-md bg-red-50 dark:bg-red-900/20 border border-red-200 dark:border-red-700">
<span className="text-red-500 mt-0.5 shrink-0">⚠️</span>
<p className="text-xs text-red-700 dark:text-red-300">
Unsupported file type{unsupported.length > 1 ? "s" : ""}: <strong>{unsupported.map((f) => f.name).join(", ")}</strong>. These files will be skipped during ingestion.
</p>
</div>
)}
{hasCsvExcel && (
<div className="flex items-start gap-2 mt-2 p-2 rounded-md bg-amber-50 dark:bg-amber-900/20 border border-amber-200 dark:border-amber-700">
<span className="text-amber-500 mt-0.5 shrink-0">ℹ️</span>
<p className="text-xs text-amber-700 dark:text-amber-300">
CSV and Excel files will be treated as unstructured text documents.
</p>
</div>
)}
</>
);
})()}
</div>

<div className="flex gap-2">
Expand Down