@@ -136,7 +136,8 @@
{"name": "proxy server", "function": "proxy_server", "screenshot": "none" },

{"name": "render jinja template", "function": "render_jinja_template", "screenshot": "none" },
{"name": "download chrome extension", "function": "download_chrome_extension", "screenshot": "none" },
{"name": "download chrome extension", "function": "download_chrome_extension", "screenshot": "none" },
{"name": "AI - LLM prompt with files", "function": "AI_LLM_prompt_with_files", "screenshot": "none" },

) # yapf: disable

127 changes: 127 additions & 0 deletions Framework/Built_In_Automation/Sequential_Actions/common_functions.py
@@ -7107,3 +7107,130 @@ def download_chrome_extension(data_set):
    except Exception:
        return CommonUtil.Exception_Handler(sys.exc_info())


@logger
def AI_LLM_prompt_with_files(data_set):
"""
This action will extract the text from images using OpenAI's vision API. This action also takes user prompt and returns
the result according to the user prompt. If the user does not give any prompt, then by default it
extracts all text from the image and returns the result in JSON format.

Args:
data_set:
------------------------------------------------------------------------------
image | input parameter | %| image.png |%
user prompt | optional parameter | Extract invoice details
AI - LLM prompt with files | common action | AI - LLM prompt with files
------------------------------------------------------------------------------

Return:
`passed` if success
`zeuz_failed` if fails
"""
    sModuleInfo = inspect.currentframe().f_code.co_name + " : " + MODULE_NAME
    global selenium_driver

    try:
        import base64
        import requests
        import json
        import os
        user_image_path = None
        user_prompt = None

        for left, mid, right in data_set:
            left = left.lower().replace(" ", "")
            mid = mid.lower().replace(" ", "")
            right = right.strip()

            if left == 'image':
                if right != '':
                    user_image_path = right

            if left == "userprompt":
                if right != '':
                    user_prompt = right

        # Validate image path
        if not user_image_path:
            CommonUtil.ExecLog(sModuleInfo, "No image path provided. Please provide an image path.", 3)
            return "zeuz_failed"

        image_path = user_image_path
        CommonUtil.ExecLog(sModuleInfo, f"Processing image: {image_path}", 1)

        if not os.path.isfile(image_path):
            CommonUtil.ExecLog(sModuleInfo, f"Image file not found: {image_path}", 3)
            return "zeuz_failed"

        prompt = user_prompt
        if not prompt:
            prompt = "Extract all text from this image and return the result in JSON format."

        # Convert image to base64
        with open(image_path, "rb") as img_file:
            base64_image = base64.b64encode(img_file.read()).decode("utf-8")

        # Load API key from .env file
        try:
            from dotenv import load_dotenv
            framework_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))

Review comment (Member):
I don't think we need to load these ourselves. load_dotenv() is already called when node_cli.py starts. So, if a user specifies an environment variable and starts the node, it'll already be available in os.getenv() calls.
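A minimal sketch of how that simplification could look, assuming the node process has already loaded the .env values and the key is still exposed as OPENAI_API (hypothetical replacement, not part of this diff):

        # Sketch (assumption): node_cli.py has already run load_dotenv(), so the key is in the process environment
        api_key = os.getenv("OPENAI_API")
        if not api_key:
            CommonUtil.ExecLog(sModuleInfo, "OPENAI_API is not set in the environment", 3)
            return "zeuz_failed"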

            env_path = os.path.join(framework_dir, ".env")
            load_dotenv(env_path)
            api_key = os.getenv("OPENAI_API")
            if not api_key:
                CommonUtil.ExecLog(sModuleInfo, "OPENAI_API not found in .env file", 3)
                return "zeuz_failed"
        except Exception as e:
            CommonUtil.ExecLog(sModuleInfo, f"Failed to load API key from .env: {str(e)}", 3)
            return "zeuz_failed"

        # Prepare API request
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": "gpt-4o",

Review comment (Member):
Users should be able to specify the model name as part of the action parameters. They should also be able to configure the temperature and any other associated hyperparameters of the model.
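A rough sketch of how those could be wired in, using hypothetical "model" and "temperature" data set rows with defaults; none of this is in the current diff:

        # Sketch (assumption): optional "model" / "temperature" rows parsed alongside the existing parameters
        model_name = "gpt-4o"    # default when the user supplies no model row
        temperature = 0.0        # default; keeps extraction deterministic
        for left, mid, right in data_set:
            left = left.lower().replace(" ", "")
            right = right.strip()
            if left == "model" and right != "":
                model_name = right
            if left == "temperature" and right != "":
                temperature = float(right)

        payload = {
            "model": model_name,
            "temperature": temperature,
            # "messages": [...] as in the diff below
        }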

"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{base64_image}"
}
},
{
"type": "text",
"text": prompt
}
]
}
]
}

# Send Request
CommonUtil.ExecLog(sModuleInfo, "Analyzing image...", 1)
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=headers,
data=json.dumps(payload)
)

# === 5. Process Response ===
if response.status_code == 200:
response_data = response.json()
extracted_data = response_data["choices"][0]["message"]["content"]
CommonUtil.ExecLog(sModuleInfo, f"Text extracted successfully from: {image_path}", 1)
CommonUtil.ExecLog(sModuleInfo, f"Extracted content: {extracted_data}", 5)
return "passed"
else:
CommonUtil.ExecLog(sModuleInfo, f"OpenAI API error: {response.status_code} - {response.text}", 3)
return "zeuz_failed"

except Exception:
return CommonUtil.Exception_Handler(sys.exc_info())
