2 changes: 2 additions & 0 deletions internnav/agent/__init__.py
@@ -1,11 +1,13 @@
from internnav.agent.base import Agent
from internnav.agent.dialog_agent import DialogAgent
from internnav.agent.cma_agent import CmaAgent
from internnav.agent.rdp_agent import RdpAgent
from internnav.agent.seq2seq_agent import Seq2SeqAgent
from internnav.agent.internvla_n1_agent import InternVLAN1Agent

__all__ = [
'Agent',
'DialogAgent',
'CmaAgent',
'RdpAgent',
'Seq2SeqAgent',
1 change: 1 addition & 0 deletions internnav/agent/base.py
@@ -25,6 +25,7 @@ def decorator(agent_class):
if agent_type in cls.agents:
raise ValueError(f"Agent {agent_type} already registered.")
cls.agents[agent_type] = agent_class
return agent_class

return decorator

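The one-line fix above matters because a decorator that returns nothing rebinds the decorated class name to None. Below is a self-contained sketch of the register-and-return pattern with illustrative names, not the repository's actual classes; the same fix is applied to Env.register in internnav/env/base.py further down.

class Registry:
    agents = {}

    @classmethod
    def register(cls, agent_type):
        def decorator(agent_class):
            if agent_type in cls.agents:
                raise ValueError(f"Agent {agent_type} already registered.")
            cls.agents[agent_type] = agent_class
            return agent_class  # without this line the decorated name becomes None
        return decorator

@Registry.register('dialog')
class DialogAgentStub:
    pass

assert Registry.agents['dialog'] is DialogAgentStub
assert DialogAgentStub is not None  # holds only because register() returns the class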
506 changes: 506 additions & 0 deletions internnav/agent/dialog_agent.py

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions internnav/configs/evaluator/__init__.py
@@ -39,8 +39,8 @@ class MetricCfg(BaseModel):

class TaskCfg(BaseModel):
task_name: Optional[str] = None
task_settings: Dict[str, Any]
scene: SceneCfg
task_settings: Dict[str, Any] = None
scene: SceneCfg = None
robot_name: Optional[str] = None
robot: Optional[RobotCfg] = None
robot_flash: Optional[bool] = None
@@ -56,6 +56,7 @@ class EvalDatasetCfg(BaseModel):


class EvalCfg(BaseModel):
remote_agent: Optional[bool] = None
eval_type: Optional[str] = None
eval_settings: Optional[Dict[str, Any]] = {}
agent: Optional[AgentCfg] = None
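The hunk above relaxes TaskCfg so that task_settings and scene may be omitted, and adds a remote_agent switch to EvalCfg. A standalone pydantic sketch of what the relaxed defaults allow, using hypothetical models rather than the repository's:

from typing import Any, Dict, Optional
from pydantic import BaseModel, ValidationError

class StrictTask(BaseModel):
    task_settings: Dict[str, Any]                    # required, as before the change

class RelaxedTask(BaseModel):
    task_settings: Optional[Dict[str, Any]] = None   # may be omitted, as after the change

try:
    StrictTask()                                     # rejected: the field is required
except ValidationError as err:
    print("strict model rejects omission:", err.errors()[0]["type"])

print(RelaxedTask())                                 # task_settings=None is accepted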
3 changes: 2 additions & 1 deletion internnav/env/__init__.py
@@ -1,4 +1,5 @@
from internnav.env.base import Env
from internnav.env.internutopia_env import InternutopiaEnv
from internnav.env.habitat_env import HabitatEnv

__all__ = ['Env', 'InternutopiaEnv']
__all__ = ['Env', 'InternutopiaEnv', 'HabitatEnv']
1 change: 1 addition & 0 deletions internnav/env/base.py
@@ -42,6 +42,7 @@ def decorator(env_class):
if env_type in cls.envs:
raise ValueError(f"Env {env_type} already registered.")
cls.envs[env_type] = env_class
return env_class

return decorator

173 changes: 173 additions & 0 deletions internnav/env/dialog_mp3d.py
@@ -0,0 +1,173 @@
import cv2
import numpy as np
from typing import List, Tuple, Union

def fill_small_holes(depth_img: np.ndarray, area_thresh: int) -> np.ndarray:
"""
Identifies regions in the depth image that have a value of 0 and fills them in
with 1 if the region is smaller than a given area threshold.

Args:
depth_img (np.ndarray): The input depth image
area_thresh (int): The area threshold for filling in holes

Returns:
np.ndarray: The depth image with small holes filled in
"""
# Create a binary image where holes are 1 and the rest is 0
binary_img = np.where(depth_img == 0, 1, 0).astype("uint8")

# Find contours in the binary image
contours, _ = cv2.findContours(binary_img, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

filled_holes = np.zeros_like(binary_img)

for cnt in contours:
# If the area of the contour is smaller than the threshold
if cv2.contourArea(cnt) < area_thresh:
# Fill the contour
cv2.drawContours(filled_holes, [cnt], 0, 1, -1)

# Create the filled depth image
filled_depth_img = np.where(filled_holes == 1, 1, depth_img)

return filled_depth_img
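A quick sanity check for fill_small_holes on synthetic data, not part of the patch (assumes numpy and the function above are in scope): a zero region below the area threshold is filled, a larger one is kept.

depth = np.full((100, 100), 0.5, dtype=np.float32)
depth[10:13, 10:13] = 0      # small hole, well below the threshold
depth[50:90, 50:90] = 0      # large hole, above the threshold

filled = fill_small_holes(depth, area_thresh=100)
print(filled[11, 11])        # 1.0 -> the small hole was filled
print(filled[60, 60])        # 0.0 -> the large hole was preserved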

class MP3DGTPerception:
def __init__(self, max_depth, min_depth, fx, fy):
self.max_depth = max_depth
self.min_depth = min_depth
self.fx = fx
self.fy = fy

def predict(self, depth, targets, tf_camera_to_ply, area_threshold=2500):
'''
Get the ground-truth semantic map of the target objects.
depth: (H, W) current depth frame, normalized to [0, 1]
targets: (N, 6) bboxes of the target objects; the first 3 values are the min corner, the last 3 the max corner
tf_camera_to_ply: (4, 4) transform from the camera frame to the ply (mesh) frame
area_threshold: int, hole-filling area threshold in pixels
return: (N, H, W) gt semantic map of the target objects
'''
# get the point clouds of current frame
filled_depth = fill_small_holes(depth, area_threshold)
scaled_depth = filled_depth * (self.max_depth - self.min_depth) + self.min_depth
mask = scaled_depth < self.max_depth
point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, self.fx, self.fy)
point_cloud_ply_frame = transform_points(tf_camera_to_ply, point_cloud_camera_frame)

# mark the points in the target objects' bboxes
semantic_images = []
for target in targets:
min_x, min_y, min_z = target[:3]
max_x, max_y, max_z = target[3:]

in_bbox = (
(point_cloud_ply_frame[:, 0] >= min_x) &
(point_cloud_ply_frame[:, 0] <= max_x) &
(point_cloud_ply_frame[:, 1] >= min_y) &
(point_cloud_ply_frame[:, 1] <= max_y) &
(point_cloud_ply_frame[:, 2] >= min_z) &
(point_cloud_ply_frame[:, 2] <= max_z)
)
in_bbox_points = point_cloud_ply_frame[in_bbox]
semantic_image = np.zeros(depth.shape, dtype=np.uint8)
if len(in_bbox_points) > 0:
# map the marked points back to the image to get the semantic map
in_bbox_camera_frame = inverse_transform_points(tf_camera_to_ply, in_bbox_points)
in_box_image_coords = project_points_to_image(in_bbox_camera_frame, self.fx, self.fy, depth.shape)
# keep only coordinates that fall inside the image bounds
height, width = depth.shape
valid = (in_box_image_coords[:, 0] >= 0) & (in_box_image_coords[:, 0] < height) & (in_box_image_coords[:, 1] >= 0) & (in_box_image_coords[:, 1] < width)
in_box_image_coords = in_box_image_coords[valid]
semantic_image[in_box_image_coords[:, 0], in_box_image_coords[:, 1]] = 1
semantic_image = fill_small_holes(semantic_image, area_threshold)
semantic_images.append(semantic_image)
if len(semantic_images) > 0:
semantic_images = np.stack(semantic_images, axis=0)
else:
semantic_images = np.zeros((1, depth.shape[0], depth.shape[1]), dtype=np.uint8)
return semantic_images
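A minimal smoke test for MP3DGTPerception.predict with synthetic inputs, not MP3D data (assumes the class and the helpers in this file are in scope): a flat depth plane, an identity camera-to-ply transform and a bounding box that encloses everything should mark every pixel.

perception = MP3DGTPerception(max_depth=5.0, min_depth=0.0, fx=256.0, fy=256.0)
depth = np.full((480, 640), 0.5, dtype=np.float32)             # normalized depth -> 2.5 m everywhere
targets = np.array([[-10.0, -10.0, -10.0, 10.0, 10.0, 10.0]])  # one box around the whole scene
tf_camera_to_ply = np.eye(4)

semantic = perception.predict(depth, targets, tf_camera_to_ply)
print(semantic.shape)    # (1, 480, 640)
print(semantic.min())    # 1 -> every pixel lies inside the box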

def transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
# Add a homogeneous coordinate of 1 to each point for matrix multiplication
homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1))))

# Apply the transformation matrix to the points
transformed_points = np.dot(transformation_matrix, homogeneous_points.T).T

# Remove the added homogeneous coordinate and divide by the last coordinate
return transformed_points[:, :3] / transformed_points[:, 3:]


def get_point_cloud(depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float) -> np.ndarray:
"""Calculates the 3D coordinates (x, y, z) of points in the depth image based on
the horizontal field of view (HFOV), the image width and height, the depth values,
and the pixel x and y coordinates.

Args:
depth_image (np.ndarray): 2D depth image.
mask (np.ndarray): 2D binary mask identifying relevant pixels.
fx (float): Focal length in the x direction.
fy (float): Focal length in the y direction.

Returns:
np.ndarray: Array of 3D coordinates (x, y, z) of the points in the camera frame.
"""
v, u = np.where(mask)
z = depth_image[v, u]
x = (u - depth_image.shape[1] // 2) * z / fx
y = (v - depth_image.shape[0] // 2) * z / fy
cloud = np.stack((x, -y, -z), axis=-1)

return cloud

def inverse_transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
"""Convert point cloud from episodic coordinate system to camera coordinate system

Args:
transformation_matrix (np.ndarray): 4x4 transformation matrix
points (np.ndarray): Point cloud coordinates (N, 3)

Returns:
np.ndarray: Point cloud coordinates in camera coordinate system (N, 3)
"""
# Calculate the inverse of the transformation matrix
inv_matrix = np.linalg.inv(transformation_matrix)

# Add a homogeneous coordinate of 1 to each point for matrix multiplication
homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1))))

# Apply the inverse transformation
transformed_points = np.dot(inv_matrix, homogeneous_points.T).T

# Remove the added homogeneous coordinate
return transformed_points[:, :3] / transformed_points[:, 3:]

def project_points_to_image(points: np.ndarray, fx: float, fy: float, image_shape: tuple) -> np.ndarray:
"""Project points from camera coordinate system to image plane

Args:
points (np.ndarray): Points in camera coordinate system (N, 3)
fx (float): x-axis focal length
fy (float): y-axis focal length
image_shape (tuple): Image dimensions (height, width)

Returns:
np.ndarray: Image coordinates (N, 2)
"""
points = np.stack((points[:,0], -points[:,1], -points[:,2]), axis=-1)
# Ensure points are in front of the camera
valid_mask = points[:, 2] > 0 # z > 0

# Calculate image coordinates
u = points[:, 0] * fx / points[:, 2] + image_shape[1] // 2
v = points[:, 1] * fy / points[:, 2] + image_shape[0] // 2

# Combine coordinates
image_coords = np.stack((v, u), axis=-1)
image_coords = image_coords.astype(np.int32)
# Return valid points only
return image_coords[valid_mask]
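A small consistency check for the helpers above, with synthetic values (assumes numpy and the functions in this file are in scope): transform_points and inverse_transform_points should round-trip, and a point on the optical axis should project to the image centre.

tf = np.eye(4)
tf[:3, 3] = [1.0, 2.0, 3.0]                      # pure translation
pts = np.array([[0.0, 0.0, 1.0], [0.5, -0.2, 2.0]])

roundtrip = inverse_transform_points(tf, transform_points(tf, pts))
print(np.allclose(roundtrip, pts))               # True

cam_pt = np.array([[0.0, 0.0, -1.0]])            # 1 m straight ahead (z = -1 in this convention)
print(project_points_to_image(cam_pt, fx=256.0, fy=256.0, image_shape=(480, 640)))
# [[240 320]] -> (row, col) at the image centre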
163 changes: 163 additions & 0 deletions internnav/env/habitat_env.py
@@ -0,0 +1,163 @@
import json
import os
import quaternion
import numpy as np
from typing import Any, Dict, List, Optional

from habitat.config.default import get_agent_config

from depth_camera_filtering import filter_depth
from internnav.configs.evaluator import EnvCfg, TaskCfg
from internnav.env import base
from .dialog_mp3d import MP3DGTPerception

@base.Env.register('habitat')
class HabitatEnv(base.Env):
def __init__(self, env_config: EnvCfg, task_config: TaskCfg = None):
"""
env_settings include:
- habitat_config: loaded from get_habitat_config
- rank: int, rank index for sharding
- world_size: int, total number of ranks
"""
try:
from habitat import Env
except ImportError as e:
raise RuntimeError(
"Habitat modules could not be imported. " "Make sure both repositories are installed and on PYTHONPATH."
) from e

super().__init__(env_config, task_config)
self.config = env_config.env_settings['habitat_config']
self._env = Env(self.config)

self.rank = env_config.env_settings.get('rank', 0)
self.world_size = env_config.env_settings.get('world_size', 1)
self._current_episode_index: int = 0
self._last_obs: Optional[Dict[str, Any]] = None

self.is_running = True
self.output_path = env_config.env_settings.get('output_path', './output')

agent_config = get_agent_config(self.config.simulator)
self.min_depth = agent_config.sim_sensors.depth_sensor.min_depth
self.max_depth = agent_config.sim_sensors.depth_sensor.max_depth
self._camera_fov = np.deg2rad(agent_config.sim_sensors.depth_sensor.hfov)
self._fx = self._fy = agent_config.sim_sensors.depth_sensor.width / (2 * np.tan(self._camera_fov / 2))
self._camera_height = agent_config.sim_sensors.rgb_sensor.position[1]
self.segmentation = MP3DGTPerception(self.max_depth, self.min_depth, self._fx, self._fy)

# generate episodes
# self._env.episodes = self._env.episodes[0:1] # for debug
self.episodes = self.generate_episodes()
# print(self.episodes)
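The focal length above follows the standard pinhole relation fx = width / (2 * tan(hfov / 2)). A worked example with hypothetical sensor values, not the repository's config:

import numpy as np

width = 640
hfov = np.deg2rad(90.0)
fx = width / (2 * np.tan(hfov / 2))
print(fx)    # 320.0 -> tan(45 deg) = 1, so fx is half the image width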

def generate_episodes(self) -> List[Any]:
"""
Generate list of episodes for the current split, already:
- grouped by scene
- filtered by done_res (the path is self.output_path/progress.json)
- sharded by (rank, world_size)
"""
all_episodes = []

# group episodes by scene
scene_episode_dict: Dict[str, List[Any]] = {}
for episode in self._env.episodes:
scene_episode_dict.setdefault(episode.scene_id, []).append(episode)

# load done_res
done_res = set()
result_path = os.path.join(self.output_path, 'progress.json')
if os.path.exists(result_path):
with open(result_path, 'r') as f:
for line in f:
res = json.loads(line)
# only skip if current format has scene_id
if "scene_id" in res:
done_res.add((res["scene_id"], res["episode_id"]))

# iterate scenes in order, collect all episodes
for scene in sorted(scene_episode_dict.keys()):
per_scene_eps = scene_episode_dict[scene]
scene_id = scene.split('/')[-2]

# shard by rank index / world_size
for episode in per_scene_eps[self.rank :: self.world_size]:
episode_id = int(episode.episode_id)
if (scene_id, episode_id) in done_res:
continue
all_episodes.append(episode)

return all_episodes
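The stride slice per_scene_eps[self.rank :: self.world_size] above is what shards each scene's episodes across ranks. A toy illustration with made-up episode indices:

per_scene_eps = list(range(10))    # stand-in for one scene's episode list
world_size = 3
for rank in range(world_size):
    print(rank, per_scene_eps[rank::world_size])
# 0 [0, 3, 6, 9]
# 1 [1, 4, 7]
# 2 [2, 5, 8]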

def reset(self):
"""
load next episode and return first observation
"""
# no more episodes
if not (0 <= self._current_episode_index < len(self.episodes)):
self.is_running = False
return

# Manually set to next episode in habitat
self._env.current_episode = self.episodes[self._current_episode_index]
self._current_episode_index += 1

# Habitat reset
self._last_obs = self._env.reset()
if "instance" in self.task_config.task_name:
self._last_obs['semantic'] = self.get_semantic(self._last_obs)
return self._last_obs

def step(self, action: List[Any]):
"""
step the environment with given action

Args: action: List[Any], action for each env in the batch

Return: obs, reward, done, info
"""
obs = self._env.step(action)
if "instance" in self.task_config.task_name:
obs['semantic'] = self.get_semantic(obs)
done = self._env.episode_over
info = self._env.get_metrics()
reward = info.get('reward', 0.0)
return obs, reward, done, info

def close(self):
print('Habitat Env close')
self._env.close()

def render(self):
self._env.render()

def get_observation(self) -> Dict[str, Any]:
return self._env.get_observations()

def get_metrics(self) -> Dict[str, Any]:
return self._env.get_metrics()

def get_current_episode(self):
return self._env.current_episode

def get_tf_episodic_to_global(self):
agent_state = self._env.sim.get_agent_state()
rotation = agent_state.rotation
translation = agent_state.position
rotation_matrix = quaternion.as_rotation_matrix(rotation)
tf_episodic_to_global = np.eye(4)
tf_episodic_to_global[:3, :3] = rotation_matrix
tf_episodic_to_global[:3, 3] = translation
return tf_episodic_to_global
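The method above packs the agent's rotation and position into one 4x4 homogeneous transform. A minimal sketch of that layout with an identity rotation (illustrative values; requires the numpy-quaternion package):

import numpy as np
import quaternion

rotation = quaternion.quaternion(1, 0, 0, 0)     # identity rotation (w, x, y, z)
translation = np.array([1.0, 0.5, -2.0])

tf = np.eye(4)
tf[:3, :3] = quaternion.as_rotation_matrix(rotation)
tf[:3, 3] = translation
print(tf)    # identity in the top-left 3x3 block, translation in the last column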

def get_semantic(self, obs: dict):
targets = [self.get_current_episode().goals[idx].bbox for idx, _ in enumerate(self.get_current_episode().instruction.instance_id)]
targets = np.array([[target[0], min(-target[2], -target[5]) , target[1], target[3], max(-target[5], -target[2]), target[4]] for target in targets])
depth = filter_depth(obs["depth"].reshape(obs["depth"].shape[:2]), blur_type=None)
tf_camera_to_global = self.get_tf_episodic_to_global()
tf_camera_to_global[1, 3] = self._camera_height + self._env.sim.get_agent_state().position[1]
tf_camera_to_ply = np.dot(np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]), tf_camera_to_global)
semantic = self.segmentation.predict(depth, targets, tf_camera_to_ply)
return semantic
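The fixed matrix in get_semantic remaps Habitat's y-up world frame into the z-up frame of the MP3D ply meshes. A one-line check of that convention (illustrative, not part of the patch):

import numpy as np

habitat_to_ply = np.array([[1, 0, 0, 0],
                           [0, 0, -1, 0],
                           [0, 1, 0, 0],
                           [0, 0, 0, 1]], dtype=np.float64)
up_in_habitat = np.array([0.0, 1.0, 0.0, 1.0])   # +y is "up" in Habitat (homogeneous point)
print(habitat_to_ply @ up_in_habitat)            # [0. 0. 1. 1.] -> +z is "up" in the ply frame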