2 changes: 2 additions & 0 deletions internnav/agent/__init__.py
@@ -1,11 +1,13 @@
from internnav.agent.base import Agent
from internnav.agent.dialog_agent import DialogAgent
from internnav.agent.cma_agent import CmaAgent
from internnav.agent.rdp_agent import RdpAgent
from internnav.agent.seq2seq_agent import Seq2SeqAgent
from internnav.agent.internvla_n1_agent import InternVLAN1Agent

__all__ = [
'Agent',
'DialogAgent',
'CmaAgent',
'RdpAgent',
'Seq2SeqAgent',
1 change: 1 addition & 0 deletions internnav/agent/base.py
@@ -25,6 +25,7 @@ def decorator(agent_class):
if agent_type in cls.agents:
raise ValueError(f"Agent {agent_type} already registered.")
cls.agents[agent_type] = agent_class
return agent_class

return decorator

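The one-line fix above matters because a decorator that returns nothing rebinds the decorated class name to None. Below is a self-contained sketch of the register-and-return pattern with illustrative names, not the repository's actual classes; the same fix is applied to Env.register in internnav/env/base.py further down.

class Registry:
    agents = {}

    @classmethod
    def register(cls, agent_type):
        def decorator(agent_class):
            if agent_type in cls.agents:
                raise ValueError(f"Agent {agent_type} already registered.")
            cls.agents[agent_type] = agent_class
            return agent_class  # without this line the decorated name becomes None
        return decorator

@Registry.register('dialog')
class DialogAgentStub:
    pass

assert Registry.agents['dialog'] is DialogAgentStub
assert DialogAgentStub is not None  # holds only because register() returns the class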
506 changes: 506 additions & 0 deletions internnav/agent/dialog_agent.py

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions internnav/configs/evaluator/__init__.py
@@ -39,8 +39,8 @@ class MetricCfg(BaseModel):

class TaskCfg(BaseModel):
task_name: Optional[str] = None
task_settings: Dict[str, Any]
scene: SceneCfg
task_settings: Dict[str, Any] = None
scene: SceneCfg = None
robot_name: Optional[str] = None
robot: Optional[RobotCfg] = None
robot_flash: Optional[bool] = None
@@ -56,6 +56,7 @@ class EvalDatasetCfg(BaseModel):


class EvalCfg(BaseModel):
remote_agent: Optional[bool] = None
eval_type: Optional[str] = None
eval_settings: Optional[Dict[str, Any]] = {}
agent: Optional[AgentCfg] = None
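The hunk above relaxes TaskCfg so that task_settings and scene may be omitted, and adds a remote_agent switch to EvalCfg. A standalone pydantic sketch of what the relaxed defaults allow, using hypothetical models rather than the repository's:

from typing import Any, Dict, Optional
from pydantic import BaseModel, ValidationError

class StrictTask(BaseModel):
    task_settings: Dict[str, Any]                    # required, as before the change

class RelaxedTask(BaseModel):
    task_settings: Optional[Dict[str, Any]] = None   # may be omitted, as after the change

try:
    StrictTask()                                     # rejected: the field is required
except ValidationError as err:
    print("strict model rejects omission:", err.errors()[0]["type"])

print(RelaxedTask())                                 # task_settings=None is accepted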
3 changes: 2 additions & 1 deletion internnav/env/__init__.py
@@ -1,4 +1,5 @@
from internnav.env.base import Env
from internnav.env.internutopia_env import InternutopiaEnv
from internnav.env.habitat_env import HabitatEnv

__all__ = ['Env', 'InternutopiaEnv']
__all__ = ['Env', 'InternutopiaEnv', 'HabitatEnv']
1 change: 1 addition & 0 deletions internnav/env/base.py
@@ -42,6 +42,7 @@ def decorator(env_class):
if env_type in cls.envs:
raise ValueError(f"Env {env_type} already registered.")
cls.envs[env_type] = env_class
return env_class

return decorator

173 changes: 173 additions & 0 deletions internnav/env/dialog_mp3d.py
@@ -0,0 +1,173 @@
import cv2
import numpy as np
from typing import List, Tuple, Union

def fill_small_holes(depth_img: np.ndarray, area_thresh: int) -> np.ndarray:
"""
Identifies regions in the depth image that have a value of 0 and fills them in
with 1 if the region is smaller than a given area threshold.

Args:
depth_img (np.ndarray): The input depth image
area_thresh (int): The area threshold for filling in holes

Returns:
np.ndarray: The depth image with small holes filled in
"""
# Create a binary image where holes are 1 and the rest is 0
binary_img = np.where(depth_img == 0, 1, 0).astype("uint8")

# Find contours in the binary image
contours, _ = cv2.findContours(binary_img, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

filled_holes = np.zeros_like(binary_img)

for cnt in contours:
# If the area of the contour is smaller than the threshold
if cv2.contourArea(cnt) < area_thresh:
# Fill the contour
cv2.drawContours(filled_holes, [cnt], 0, 1, -1)

# Create the filled depth image
filled_depth_img = np.where(filled_holes == 1, 1, depth_img)

return filled_depth_img
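A quick sanity check for fill_small_holes on synthetic data, not part of the patch (assumes numpy and the function above are in scope): a zero region below the area threshold is filled, a larger one is kept.

depth = np.full((100, 100), 0.5, dtype=np.float32)
depth[10:13, 10:13] = 0      # small hole, well below the threshold
depth[50:90, 50:90] = 0      # large hole, above the threshold

filled = fill_small_holes(depth, area_thresh=100)
print(filled[11, 11])        # 1.0 -> the small hole was filled
print(filled[60, 60])        # 0.0 -> the large hole was preserved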

class MP3DGTPerception:
def __init__(self, max_depth, min_depth, fx, fy):
self.max_depth = max_depth
self.min_depth = min_depth
self.fx = fx
self.fy = fy

def predict(self, depth, targets, tf_camera_to_ply, area_threshold=2500):
'''
Get the ground-truth semantic map of the target objects.
depth: (H, W) current depth frame, normalized to [0, 1]
targets: (N, 6) bboxes of the target objects; the first 3 values are the min corner, the last 3 the max corner
tf_camera_to_ply: (4, 4) transform from the camera frame to the ply (mesh) frame
area_threshold: int, hole-filling area threshold in pixels
return: (N, H, W) gt semantic map of the target objects
'''
# get the point clouds of current frame
filled_depth = fill_small_holes(depth, area_threshold)
scaled_depth = filled_depth * (self.max_depth - self.min_depth) + self.min_depth
mask = scaled_depth < self.max_depth
point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, self.fx, self.fy)
point_cloud_ply_frame = transform_points(tf_camera_to_ply, point_cloud_camera_frame)

# mark the points in the target objects' bboxes
semantic_images = []
for target in targets:
min_x, min_y, min_z = target[:3]
max_x, max_y, max_z = target[3:]

in_bbox = (
(point_cloud_ply_frame[:, 0] >= min_x) &
(point_cloud_ply_frame[:, 0] <= max_x) &
(point_cloud_ply_frame[:, 1] >= min_y) &
(point_cloud_ply_frame[:, 1] <= max_y) &
(point_cloud_ply_frame[:, 2] >= min_z) &
(point_cloud_ply_frame[:, 2] <= max_z)
)
in_bbox_points = point_cloud_ply_frame[in_bbox]
semantic_image = np.zeros(depth.shape, dtype=np.uint8)
if len(in_bbox_points) > 0:
# map the marked points back to the image to get the semantic map
in_bbox_camera_frame = inverse_transform_points(tf_camera_to_ply, in_bbox_points)
in_box_image_coords = project_points_to_image(in_bbox_camera_frame, self.fx, self.fy, depth.shape)
# keep only coordinates that fall inside the image bounds
height, width = depth.shape
valid = (in_box_image_coords[:, 0] >= 0) & (in_box_image_coords[:, 0] < height) & (in_box_image_coords[:, 1] >= 0) & (in_box_image_coords[:, 1] < width)
in_box_image_coords = in_box_image_coords[valid]
semantic_image[in_box_image_coords[:, 0], in_box_image_coords[:, 1]] = 1
semantic_image = fill_small_holes(semantic_image, area_threshold)
semantic_images.append(semantic_image)
if len(semantic_images) > 0:
semantic_images = np.stack(semantic_images, axis=0)
else:
semantic_images = np.zeros((1, depth.shape[0], depth.shape[1]), dtype=np.uint8)
return semantic_images
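A minimal smoke test for MP3DGTPerception.predict with synthetic inputs, not MP3D data (assumes the class and the helpers in this file are in scope): a flat depth plane, an identity camera-to-ply transform and a bounding box that encloses everything should mark every pixel.

perception = MP3DGTPerception(max_depth=5.0, min_depth=0.0, fx=256.0, fy=256.0)
depth = np.full((480, 640), 0.5, dtype=np.float32)             # normalized depth -> 2.5 m everywhere
targets = np.array([[-10.0, -10.0, -10.0, 10.0, 10.0, 10.0]])  # one box around the whole scene
tf_camera_to_ply = np.eye(4)

semantic = perception.predict(depth, targets, tf_camera_to_ply)
print(semantic.shape)    # (1, 480, 640)
print(semantic.min())    # 1 -> every pixel lies inside the box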

def transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
# Add a homogeneous coordinate of 1 to each point for matrix multiplication
homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1))))

# Apply the transformation matrix to the points
transformed_points = np.dot(transformation_matrix, homogeneous_points.T).T

# Remove the added homogeneous coordinate and divide by the last coordinate
return transformed_points[:, :3] / transformed_points[:, 3:]


def get_point_cloud(depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float) -> np.ndarray:
"""Calculates the 3D coordinates (x, y, z) of points in the depth image based on
the horizontal field of view (HFOV), the image width and height, the depth values,
and the pixel x and y coordinates.

Args:
depth_image (np.ndarray): 2D depth image.
mask (np.ndarray): 2D binary mask identifying relevant pixels.
fx (float): Focal length in the x direction.
fy (float): Focal length in the y direction.

Returns:
np.ndarray: Array of 3D coordinates (x, y, z) of the points in the camera frame.
"""
v, u = np.where(mask)
z = depth_image[v, u]
x = (u - depth_image.shape[1] // 2) * z / fx
y = (v - depth_image.shape[0] // 2) * z / fy
cloud = np.stack((x, -y, -z), axis=-1)

return cloud

def inverse_transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
"""Convert point cloud from episodic coordinate system to camera coordinate system

Args:
transformation_matrix (np.ndarray): 4x4 transformation matrix
points (np.ndarray): Point cloud coordinates (N, 3)

Returns:
np.ndarray: Point cloud coordinates in camera coordinate system (N, 3)
"""
# Calculate the inverse of the transformation matrix
inv_matrix = np.linalg.inv(transformation_matrix)

# Add a homogeneous coordinate of 1 to each point for matrix multiplication
homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1))))

# Apply the inverse transformation
transformed_points = np.dot(inv_matrix, homogeneous_points.T).T

# Remove the added homogeneous coordinate
return transformed_points[:, :3] / transformed_points[:, 3:]

def project_points_to_image(points: np.ndarray, fx: float, fy: float, image_shape: tuple) -> np.ndarray:
"""Project points from camera coordinate system to image plane

Args:
points (np.ndarray): Points in camera coordinate system (N, 3)
fx (float): x-axis focal length
fy (float): y-axis focal length
image_shape (tuple): Image dimensions (height, width)

Returns:
np.ndarray: Image coordinates (N, 2)
"""
points = np.stack((points[:,0], -points[:,1], -points[:,2]), axis=-1)
# Ensure points are in front of the camera
valid_mask = points[:, 2] > 0 # z > 0

# Calculate image coordinates
u = points[:, 0] * fx / points[:, 2] + image_shape[1] // 2
v = points[:, 1] * fy / points[:, 2] + image_shape[0] // 2

# Combine coordinates
image_coords = np.stack((v, u), axis=-1)
image_coords = image_coords.astype(np.int32)
# Return valid points only
return image_coords[valid_mask]
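A small consistency check for the helpers above, with synthetic values (assumes numpy and the functions in this file are in scope): transform_points and inverse_transform_points should round-trip, and a point on the optical axis should project to the image centre.

tf = np.eye(4)
tf[:3, 3] = [1.0, 2.0, 3.0]                      # pure translation
pts = np.array([[0.0, 0.0, 1.0], [0.5, -0.2, 2.0]])

roundtrip = inverse_transform_points(tf, transform_points(tf, pts))
print(np.allclose(roundtrip, pts))               # True

cam_pt = np.array([[0.0, 0.0, -1.0]])            # 1 m straight ahead (z = -1 in this convention)
print(project_points_to_image(cam_pt, fx=256.0, fy=256.0, image_shape=(480, 640)))
# [[240 320]] -> (row, col) at the image centre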
163 changes: 163 additions & 0 deletions internnav/env/habitat_env.py
@@ -0,0 +1,163 @@
import json
import os
import quaternion
import numpy as np
from typing import Any, Dict, List, Optional

from habitat.config.default import get_agent_config

from depth_camera_filtering import filter_depth
from internnav.configs.evaluator import EnvCfg, TaskCfg
from internnav.env import base
from .dialog_mp3d import MP3DGTPerception

@base.Env.register('habitat')
class HabitatEnv(base.Env):
def __init__(self, env_config: EnvCfg, task_config: TaskCfg = None):
"""
env_settings include:
- habitat_config: loaded from get_habitat_config
- rank: int, rank index for sharding
- world_size: int, total number of ranks
"""
try:
from habitat import Env
except ImportError as e:
raise RuntimeError(
"Habitat modules could not be imported. " "Make sure both repositories are installed and on PYTHONPATH."
) from e

super().__init__(env_config, task_config)
self.config = env_config.env_settings['habitat_config']
self._env = Env(self.config)

self.rank = env_config.env_settings.get('rank', 0)
self.world_size = env_config.env_settings.get('world_size', 1)
self._current_episode_index: int = 0
self._last_obs: Optional[Dict[str, Any]] = None

self.is_running = True
self.output_path = env_config.env_settings.get('output_path', './output')

agent_config = get_agent_config(self.config.simulator)
self.min_depth = agent_config.sim_sensors.depth_sensor.min_depth
self.max_depth = agent_config.sim_sensors.depth_sensor.max_depth
self._camera_fov = np.deg2rad(agent_config.sim_sensors.depth_sensor.hfov)
self._fx = self._fy = agent_config.sim_sensors.depth_sensor.width / (2 * np.tan(self._camera_fov / 2))
self._camera_height = agent_config.sim_sensors.rgb_sensor.position[1]
self.segmentation = MP3DGTPerception(self.max_depth, self.min_depth, self._fx, self._fy)

# generate episodes
# self._env.episodes = self._env.episodes[0:1] # for debug
self.episodes = self.generate_episodes()
# print(self.episodes)
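The focal length above follows the standard pinhole relation fx = width / (2 * tan(hfov / 2)). A worked example with hypothetical sensor values, not the repository's config:

import numpy as np

width = 640
hfov = np.deg2rad(90.0)
fx = width / (2 * np.tan(hfov / 2))
print(fx)    # 320.0 -> tan(45 deg) = 1, so fx is half the image width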

def generate_episodes(self) -> List[Any]:
"""
Generate list of episodes for the current split, already:
- grouped by scene
- filtered by done_res (the path is self.output_path/progress.json)
- sharded by (rank, world_size)
"""
all_episodes = []

# group episodes by scene
scene_episode_dict: Dict[str, List[Any]] = {}
for episode in self._env.episodes:
scene_episode_dict.setdefault(episode.scene_id, []).append(episode)

# load done_res
done_res = set()
result_path = os.path.join(self.output_path, 'progress.json')
if os.path.exists(result_path):
with open(result_path, 'r') as f:
for line in f:
res = json.loads(line)
# only skip if current format has scene_id
if "scene_id" in res:
done_res.add((res["scene_id"], res["episode_id"]))

# iterate scenes in order, collect all episodes
for scene in sorted(scene_episode_dict.keys()):
per_scene_eps = scene_episode_dict[scene]
scene_id = scene.split('/')[-2]

# shard by rank index / world_size
for episode in per_scene_eps[self.rank :: self.world_size]:
episode_id = int(episode.episode_id)
if (scene_id, episode_id) in done_res:
continue
all_episodes.append(episode)

return all_episodes
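The stride slice per_scene_eps[self.rank :: self.world_size] above is what shards each scene's episodes across ranks. A toy illustration with made-up episode indices:

per_scene_eps = list(range(10))    # stand-in for one scene's episode list
world_size = 3
for rank in range(world_size):
    print(rank, per_scene_eps[rank::world_size])
# 0 [0, 3, 6, 9]
# 1 [1, 4, 7]
# 2 [2, 5, 8]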

def reset(self):
"""
load next episode and return first observation
"""
# no more episodes
if not (0 <= self._current_episode_index < len(self.episodes)):
self.is_running = False
return

# Manually set to next episode in habitat
self._env.current_episode = self.episodes[self._current_episode_index]
self._current_episode_index += 1

# Habitat reset
self._last_obs = self._env.reset()
if "instance" in self.task_config.task_name:
self._last_obs['semantic'] = self.get_semantic(self._last_obs)
return self._last_obs

def step(self, action: List[Any]):
"""
step the environment with given action

Args: action: List[Any], action for each env in the batch

Return: obs, reward, done, info
"""
obs = self._env.step(action)
if "instance" in self.task_config.task_name:
obs['semantic'] = self.get_semantic(obs)
done = self._env.episode_over
info = self._env.get_metrics()
reward = info.get('reward', 0.0)
return obs, reward, done, info

def close(self):
print('Habitat Env close')
self._env.close()

def render(self):
self._env.render()

def get_observation(self) -> Dict[str, Any]:
return self._env.get_observations()

def get_metrics(self) -> Dict[str, Any]:
return self._env.get_metrics()

def get_current_episode(self):
return self._env.current_episode

def get_tf_episodic_to_global(self):
agent_state = self._env.sim.get_agent_state()
rotation = agent_state.rotation
translation = agent_state.position
rotation_matrix = quaternion.as_rotation_matrix(rotation)
tf_episodic_to_global = np.eye(4)
tf_episodic_to_global[:3, :3] = rotation_matrix
tf_episodic_to_global[:3, 3] = translation
return tf_episodic_to_global
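The method above packs the agent's rotation and position into one 4x4 homogeneous transform. A minimal sketch of that layout with an identity rotation (illustrative values; requires the numpy-quaternion package):

import numpy as np
import quaternion

rotation = quaternion.quaternion(1, 0, 0, 0)     # identity rotation (w, x, y, z)
translation = np.array([1.0, 0.5, -2.0])

tf = np.eye(4)
tf[:3, :3] = quaternion.as_rotation_matrix(rotation)
tf[:3, 3] = translation
print(tf)    # identity in the top-left 3x3 block, translation in the last column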

def get_semantic(self, obs: dict):
targets = [self.get_current_episode().goals[idx].bbox for idx, _ in enumerate(self.get_current_episode().instruction.instance_id)]
targets = np.array([[target[0], min(-target[2], -target[5]) , target[1], target[3], max(-target[5], -target[2]), target[4]] for target in targets])
depth = filter_depth(obs["depth"].reshape(obs["depth"].shape[:2]), blur_type=None)
tf_camera_to_global = self.get_tf_episodic_to_global()
tf_camera_to_global[1, 3] = self._camera_height + self._env.sim.get_agent_state().position[1]
tf_camera_to_ply = np.dot(np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]), tf_camera_to_global)
semantic = self.segmentation.predict(depth, targets, tf_camera_to_ply)
return semantic
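The fixed matrix in get_semantic remaps Habitat's y-up world frame into the z-up frame of the MP3D ply meshes. A one-line check of that convention (illustrative, not part of the patch):

import numpy as np

habitat_to_ply = np.array([[1, 0, 0, 0],
                           [0, 0, -1, 0],
                           [0, 1, 0, 0],
                           [0, 0, 0, 1]], dtype=np.float64)
up_in_habitat = np.array([0.0, 1.0, 0.0, 1.0])   # +y is "up" in Habitat (homogeneous point)
print(habitat_to_ply @ up_in_habitat)            # [0. 0. 1. 1.] -> +z is "up" in the ply frame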