# Source: datacloud-customcode-python-sdk/src/datacustomcode/templates/function/payload/entrypoint.py at 3a51eb27ea2293493bbc4dd9bcc1a39fe28fc8f6 (diksha-sf/datacloud-customcode-python-sdk, GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import logging
from typing import List
from uuid import uuid4

logger = logging.getLogger(__name__)


def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
    """
    Split text into chunks of approximately chunk_size characters.

    The text is cut at the literal delimiter ". " and the resulting
    sentences are packed greedily: a sentence is added to the current chunk
    while the chunk's length plus the sentence's length (delimiter excluded)
    stays within chunk_size; otherwise the current chunk is emitted and a
    new one is started. The delimiter is restored only *between* sentences,
    so the original text is reproduced without an extra trailing period.

    Args:
        text: The text to split. Falsy input (empty string, None) yields [].
        chunk_size: Target maximum chunk length in characters. A chunk may
            exceed it when a single sentence is longer than chunk_size
            (sentences are never split) or by the restored delimiter.

    Returns:
        The chunks in document order, each stripped of surrounding
        whitespace. Chunks that are empty after stripping are omitted.
    """
    if not text:
        return []

    separator = ". "
    pieces = text.split(separator)
    last_index = len(pieces) - 1

    chunks: List[str] = []
    current_chunk = ""

    for index, piece in enumerate(pieces):
        # Every piece except the final one was followed by the delimiter in
        # the original text; re-attaching it to the final piece would invent
        # a period that was never there.
        sentence = piece if index == last_index else piece + separator

        if current_chunk and len(current_chunk) + len(piece) > chunk_size:
            # Current chunk is full: emit it and start a new one.
            stripped = current_chunk.strip()
            if stripped:
                chunks.append(stripped)
            current_chunk = sentence
        else:
            # Either the sentence fits, or the chunk is empty and the
            # oversized sentence has to be kept whole anyway.
            current_chunk += sentence

    stripped = current_chunk.strip()
    if stripped:
        chunks.append(stripped)

    return chunks


def function(request: dict) -> dict:
    """
    Chunk the text of every item in request["input"].

    Each item is read as a dict: its "text" value (default "") is split with
    chunk_text using a chunk size of 100, and one output dict is produced
    per chunk. Sequence numbers start at 1 and keep counting across items.
    The item's "metadata" value (default {}) and the item itself are attached
    to every chunk by reference, not copied.

    Args:
        request: Dict with an "input" key holding an iterable of item dicts.

    Returns:
        Dict with "output" (the list of chunk dicts) and a "status" dict
        reporting success.

    Raises:
        KeyError: If request has no "input" key.
    """
    logger.info("Inside Function")
    logger.info(request)

    collected = []

    for doc in request["input"]:
        # Each doc is a DocElement represented as a dict
        logger.info(f"Processing item: {doc}")

        doc_text = doc.get("text", "")
        doc_metadata = doc.get("metadata", {})
        pieces = chunk_text(doc_text, chunk_size=100)

        # Numbering continues from however many chunks exist so far, which
        # keeps seq_no 1-based and contiguous across all input items.
        first_seq_no = len(collected) + 1
        doc_chunks = [
            {
                "text": piece,
                "metadata": doc_metadata,
                "seq_no": seq_no,
                "chunk_type": "text",
                "chunk_id": str(uuid4()),
                "tag_metadata": {},
                "citations": {},
                "source_record": doc,
            }
            for seq_no, piece in enumerate(pieces, start=first_seq_no)
        ]
        collected.extend(doc_chunks)

    logger.info("Completed chunking")

    status = {"status_type": "success", "status_message": "Chunking completed"}
    response = {"output": collected, "status": status}
    logger.info(response)
    return response


# Manual smoke test: run this module directly to chunk two sample documents
if __name__ == "__main__":
    # Show INFO-level records so the log calls made by function() are visible
    logging.basicConfig(level=logging.INFO)

    # Sample texts. The line breaks and the leading whitespace of the
    # continuation lines are inside the triple-quoted literals, so they are
    # part of the string values and must stay exactly as written.
    first_document_text = (
        """This is the first sentence of the first document, which is
                    intentionally made longer to test chunking. """
        """Here is the second sentence of the first document, which is also
                     quite long and should ensure that the chunking function splits
                     this text into two chunks when the chunk size is set to 100."""
    )
    second_document_text = (
        """This is the first sentence of the second document, and it is
                    also extended to be longer than usual for testing purposes. """
        """The second sentence of the second document is similarly lengthy,
                     so that the chunking function will again create two chunks for
                     this document."""
    )

    # Two DocElement-style dicts wrapped in the request envelope
    sample_documents = [
        {
            "text": first_document_text,
            "metadata": {"source": "test1", "type": "document"},
        },
        {
            "text": second_document_text,
            "metadata": {"source": "test2", "type": "document"},
        },
    ]
    outcome = function({"input": sample_documents})

    # Report every produced chunk in a readable layout
    print("\nChunking Results:")
    print("----------------")
    for entry in outcome["output"]:
        print(f"\nChunk #{entry['seq_no']}:")
        preview = entry["text"][:100]  # at most the first 100 characters
        print(f"Text: {preview}...")
        print(f"Source: {entry['metadata']['source']}")
        print(f"Chunk ID: {entry['chunk_id']}")