From 2f2cfb749c5c96bd6926de5f340fb62bdc4a0340 Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Mon, 21 Apr 2025 13:07:53 +0530 Subject: [PATCH 1/4] Minor fixes Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- .../prompt_studio_core_v2/prompt_studio_helper.py | 10 ++-------- .../src/unstract/prompt_service/core/index_v2.py | 4 ++-- .../src/unstract/prompt_service/helpers/usage.py | 10 +++++----- .../src/unstract/prompt_service/services/indexing.py | 4 ---- tools/structure/src/helpers.py | 11 +++++++---- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py index f36a08500..e55e1cd62 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py @@ -378,13 +378,6 @@ def index_document( fs=fs_instance, tool=util, ) - if DocumentIndexingService.is_document_indexing( - org_id=org_id, user_id=user_id, doc_id_key=doc_id - ): - return { - "status": IndexingStatus.PENDING_STATUS.value, - "output": IndexingStatus.DOCUMENT_BEING_INDEXED.value, - } extracted_text = PromptStudioHelper.dynamic_extractor( profile_manager=default_profile, file_path=file_path, @@ -1360,7 +1353,8 @@ def dynamic_extractor( status_code = response.get("status_code") if status_code == 200: response_data = response.get("structure_output") - extracted_text = json.loads(response_data) + structure_output = json.loads(response_data) + extracted_text = structure_output.get("extracted_text") PromptStudioIndexHelper.mark_extraction_status( document_id=document_id, profile_manager=profile_manager, diff --git a/prompt-service/src/unstract/prompt_service/core/index_v2.py b/prompt-service/src/unstract/prompt_service/core/index_v2.py index e3c1342ad..65d67d214 100644 --- a/prompt-service/src/unstract/prompt_service/core/index_v2.py +++ b/prompt-service/src/unstract/prompt_service/core/index_v2.py @@ -153,7 +153,7 @@ def perform_indexing( vector_db: VectorDB, doc_id: str, extracted_text: str, - doc_id_found: str, + doc_id_found: bool, ): if isinstance( vector_db.get_vector_db( @@ -183,7 +183,7 @@ def _trigger_indexing(self, vector_db, documents): try: vector_db.index_document( documents, - chunk_size=self.chunking_config.chunk_overlap, + chunk_size=self.chunking_config.chunk_size, chunk_overlap=self.chunking_config.chunk_overlap, show_progress=True, ) diff --git a/prompt-service/src/unstract/prompt_service/helpers/usage.py b/prompt-service/src/unstract/prompt_service/helpers/usage.py index eecade1bd..8fd5ea010 100644 --- a/prompt-service/src/unstract/prompt_service/helpers/usage.py +++ b/prompt-service/src/unstract/prompt_service/helpers/usage.py @@ -40,11 +40,11 @@ def query_usage_metadata(token: str, metadata: dict[str, Any]) -> dict[str, Any] # Process results as needed for row in results: key, item = UsageHelper._get_key_and_item(row) - # Initialize the key as an empty list if it doesn't exist - if key not in metadata: - metadata[key] = [] - # Append the item to the list associated with the key - metadata[key].append(item) + # Initialize the key as an empty list if it doesn't exist + if key not in metadata: + metadata[key] = [] + # Append the item to the list associated with the key + metadata[key].append(item) except Exception as e: logger.error(f"Error while querying usage metadata: {e}") return metadata diff --git a/prompt-service/src/unstract/prompt_service/services/indexing.py b/prompt-service/src/unstract/prompt_service/services/indexing.py index 9205294c8..d4090facf 100644 --- a/prompt-service/src/unstract/prompt_service/services/indexing.py +++ b/prompt-service/src/unstract/prompt_service/services/indexing.py @@ -1,6 +1,5 @@ import logging -from flask import current_app as app from unstract.prompt_service.core.index_v2 import Index from unstract.prompt_service.dto import ( ChunkingConfig, @@ -70,9 +69,6 @@ def index( embedding=embedding, vector_db=vector_db, ) - if doc_id_found: - app.logger.info("Doc ID found: %s", doc_id_found) - return doc_id # Index and return doc_id index.perform_indexing( diff --git a/tools/structure/src/helpers.py b/tools/structure/src/helpers.py index aac6a4293..8855c477c 100644 --- a/tools/structure/src/helpers.py +++ b/tools/structure/src/helpers.py @@ -1,4 +1,5 @@ import datetime +import json from typing import Any, Optional from constants import IndexingConstants as IKeys @@ -42,8 +43,10 @@ def dynamic_extraction( prompt_port=tool.get_env_or_die(SettingsKeys.PROMPT_PORT), ) tool.stream_log(f"responder : {responder}") - extracted_text = responder.extract(payload=payload) - + response = responder.extract(payload=payload) + response_data = response.get("structure_output") + structure_output = json.loads(response_data) + extracted_text = structure_output.get("extracted_text") return extracted_text @staticmethod @@ -59,8 +62,8 @@ def dynamic_indexing( chunk_size: int, chunk_overlap: int, file_hash: Optional[str] = None, - tool_id: str = None, - extracted_text: str = None, + tool_id: Optional[str] = None, + extracted_text: Optional[str] = None, ) -> str: x2text = tool_settings[SettingsKeys.X2TEXT_ADAPTER] From 62a18a6f0b41a7a6cbb20ca64d7687cfa21be2b2 Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Tue, 22 Apr 2025 16:42:45 +0530 Subject: [PATCH 2/4] Change to use summary properly Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- tools/structure/src/main.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/structure/src/main.py b/tools/structure/src/main.py index 0f0d79492..cc84e236f 100644 --- a/tools/structure/src/main.py +++ b/tools/structure/src/main.py @@ -131,7 +131,7 @@ def run( if tool_settings[SettingsKeys.ENABLE_SINGLE_PASS_EXTRACTION]: if summarize_as_source: - summarize_file_hash = self._summarize_and_index( + summarize_file_path, summarize_file_hash = self._summarize_and_index( tool_settings=tool_settings, tool_data_dir=tool_data_dir, responder=responder, @@ -139,6 +139,7 @@ def run( usage_kwargs=usage_kwargs, ) payload[SettingsKeys.FILE_HASH] = summarize_file_hash + payload[SettingsKeys.FILE_PATH] = summarize_file_path self.stream_log("Fetching response for single pass extraction") # Since indexing is not involved for single pass index_metrics = {"time_taken(s)": 0} @@ -152,15 +153,18 @@ def run( reindex = True for output in outputs: if summarize_as_source: - summarize_file_hash = self._summarize_and_index( - tool_settings=tool_settings, - tool_data_dir=tool_data_dir, - responder=responder, - outputs=outputs, - usage_kwargs=usage_kwargs, + summarize_file_path, summarize_file_hash = ( + self._summarize_and_index( + tool_settings=tool_settings, + tool_data_dir=tool_data_dir, + responder=responder, + outputs=outputs, + usage_kwargs=usage_kwargs, + ) ) payload[SettingsKeys.OUTPUTS] = outputs payload[SettingsKeys.FILE_HASH] = summarize_file_hash + payload[SettingsKeys.FILE_PATH] = summarize_file_path # Since indexing is not involved for summary index_metrics[output[SettingsKeys.NAME]] = {"time_taken(s)": 0} break @@ -282,7 +286,7 @@ def _summarize_and_index( responder: PromptTool, outputs: dict[str, Any], usage_kwargs: dict[Any, Any] = {}, - ) -> str: + ) -> tuple[str, str]: """Summarizes the context of the file and indexes the summarized content. @@ -345,11 +349,10 @@ def _summarize_and_index( path=summarize_file_path, mode="w", data=summarized_context ) - self.stream_log("Indexing summarized context") summarize_file_hash: str = self.workflow_filestorage.get_hash_from_file( path=summarize_file_path ) - return summarize_file_hash + return str(summarize_file_path), summarize_file_hash if __name__ == "__main__": From 5395fa79fc7416041c11458753c6cdd05205e8fe Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Tue, 22 Apr 2025 16:45:06 +0530 Subject: [PATCH 3/4] structure tool version bump Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- backend/sample.env | 4 ++-- tools/structure/src/config/properties.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/sample.env b/backend/sample.env index c149e73e6..117a97e3a 100644 --- a/backend/sample.env +++ b/backend/sample.env @@ -75,9 +75,9 @@ PROMPT_STUDIO_FILE_PATH=/app/prompt-studio-data # Structure Tool Image (Runs prompt studio exported tools) # https://hub.docker.com/r/unstract/tool-structure -STRUCTURE_TOOL_IMAGE_URL="docker:unstract/tool-structure:0.0.75" +STRUCTURE_TOOL_IMAGE_URL="docker:unstract/tool-structure:0.0.76" STRUCTURE_TOOL_IMAGE_NAME="unstract/tool-structure" -STRUCTURE_TOOL_IMAGE_TAG="0.0.75" +STRUCTURE_TOOL_IMAGE_TAG="0.0.76" # Feature Flags EVALUATION_SERVER_IP=unstract-flipt diff --git a/tools/structure/src/config/properties.json b/tools/structure/src/config/properties.json index 9b618c952..dfa8e2142 100644 --- a/tools/structure/src/config/properties.json +++ b/tools/structure/src/config/properties.json @@ -2,7 +2,7 @@ "schemaVersion": "0.0.1", "displayName": "Structure Tool", "functionName": "structure_tool", - "toolVersion": "0.0.75", + "toolVersion": "0.0.76", "description": "This is a template tool which can answer set of input prompts designed in the Prompt Studio", "input": { "description": "File that needs to be indexed and parsed for answers" From 5b5bc0cb3c808c02c1ef651d7fb0f8ebc122a87d Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Wed, 23 Apr 2025 10:47:09 +0530 Subject: [PATCH 4/4] File path fix Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- .../src/unstract/prompt_service/services/extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py index 6d188425f..c6f0377b7 100644 --- a/prompt-service/src/unstract/prompt_service/services/extraction.py +++ b/prompt-service/src/unstract/prompt_service/services/extraction.py @@ -86,7 +86,7 @@ def update_exec_metadata( metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value} for key, value in metadata.items(): tool_exec_metadata[key] = value - metadata_path = str(Path(execution_run_data_folder / IKeys.METADATA_FILE)) + metadata_path = str(Path(execution_run_data_folder) / IKeys.METADATA_FILE) ToolUtils.dump_json( file_to_dump=metadata_path, json_to_dump=metadata,