Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion internnav/dataset/navdp_dataset_lerobot.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
image_size=224,
scene_data_scale=1.0,
trajectory_data_scale=1.0,
pixel_channel=7,
debug=False,
preload=False,
random_digit=False,
Expand All @@ -61,6 +62,7 @@ def __init__(
self.trajectory_afford_path = []
self.random_digit = random_digit
self.prior_sample = prior_sample
self.pixel_channel = pixel_channel
self.item_cnt = 0
self.batch_size = batch_size
self.batch_time_sum = 0.0
Expand Down Expand Up @@ -509,7 +511,12 @@ def __getitem__(self, index):
camera_intrinsic,
trajectory_base_extrinsic,
)
pixel_goal = np.concatenate((pixel_goal, memory_images[-1]), axis=-1)
# pixel channel == 7 represents the navdp works pixel navigation under asynchronous pace,
# pixel_mask (1), the history image with the assigned pixel goal (3), current image (3)
# if pixel_channel == 4, pixel goal is assigned at current frame, therefore,
# only pixel_mask (1) and current image (3) are needed
if self.pixel_channel == 7:
pixel_goal = np.concatenate((pixel_goal, memory_images[-1]), axis=-1)

pred_actions = (pred_actions[1:] - pred_actions[:-1]) * 4.0
augment_actions = (augment_actions[1:] - augment_actions[:-1]) * 4.0
Expand Down
46 changes: 16 additions & 30 deletions internnav/model/basemodel/navdp/navdp_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@

from internnav.configs.model.base_encoders import ModelCfg
from internnav.configs.trainer.exp import ExpCfg
from internnav.model.encoder.navdp_backbone import *
from internnav.model.encoder.navdp_backbone import (
ImageGoalBackbone,
LearnablePositionalEncoding,
PixelGoalBackbone,
RGBDBackbone,
SinusoidalPosEmb,
)


class NavDPModelConfig(PretrainedConfig):
Expand Down Expand Up @@ -51,9 +57,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
elif pretrained_model_name_or_path is None or len(pretrained_model_name_or_path) == 0:
pass
else:
incompatible_keys, _ = model.load_state_dict(
torch.load(pretrained_model_name_or_path)['state_dict'], strict=False
)
incompatible_keys, _ = model.load_state_dict(torch.load(pretrained_model_name_or_path), strict=False)
if len(incompatible_keys) > 0:
print(f'Incompatible keys: {incompatible_keys}')

Expand All @@ -66,31 +70,32 @@ def __init__(self, config: NavDPModelConfig):
self.model_config = ModelCfg(**config.model_cfg['model'])
else:
self.model_config = config

self.config.model_cfg['il']

self._device = torch.device(f"cuda:{config.model_cfg['local_rank']}")
self.image_size = self.config.model_cfg['il']['image_size']
self.memory_size = self.config.model_cfg['il']['memory_size']
self.predict_size = self.config.model_cfg['il']['predict_size']
self.pixel_channel = self.config.model_cfg['il']['pixel_channel']
self.temporal_depth = self.config.model_cfg['il']['temporal_depth']
self.attention_heads = self.config.model_cfg['il']['heads']
self.input_channels = self.config.model_cfg['il']['channels']
self.dropout = self.config.model_cfg['il']['dropout']
self.token_dim = self.config.model_cfg['il']['token_dim']
self.scratch = self.config.model_cfg['il']['scratch']
self.finetune = self.config.model_cfg['il']['finetune']
self.rgbd_encoder = NavDP_RGBD_Backbone(
self.rgbd_encoder = RGBDBackbone(
self.image_size, self.token_dim, memory_size=self.memory_size, finetune=self.finetune, device=self._device
)
self.pixel_encoder = NavDP_PixelGoal_Backbone(self.image_size, self.token_dim, device=self._device)
self.image_encoder = NavDP_ImageGoal_Backbone(self.image_size, self.token_dim, device=self._device)
self.pixel_encoder = PixelGoalBackbone(
self.image_size, self.token_dim, pixel_channel=self.pixel_channel, device=self._device
)
self.image_encoder = ImageGoalBackbone(self.image_size, self.token_dim, device=self._device)
self.point_encoder = nn.Linear(3, self.token_dim)

if not self.finetune:
for p in self.rgbd_encoder.parameters():
for p in self.rgbd_encoder.rgb_model.parameters():
p.requires_grad = False
self.rgbd_encoder.eval()
self.rgbd_encoder.rgb_model.eval()

decoder_layer = nn.TransformerDecoderLayer(
d_model=self.token_dim,
Expand Down Expand Up @@ -180,23 +185,6 @@ def predict_critic(self, predict_trajectory, rgbd_embed):
return critic_output

def forward(self, goal_point, goal_image, goal_pixel, input_images, input_depths, output_actions, augment_actions):
# """get device safely"""
# # get device safely
# try:
# # try to get device through model parameters
# device = next(self.parameters()).device
# except StopIteration:
# # model has no parameters, use the default device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # move all inputs to model device
# goal_point = goal_point.to(device)
# goal_image = goal_image.to(device)
# input_images = input_images.to(device)
# input_depths = input_depths.to(device)
# output_actions = output_actions.to(device)
# augment_actions = augment_actions.to(device)
# device = self._device
# print(f"self.parameters() is:{self.parameters()}")
device = next(self.parameters()).device

assert input_images.shape[1] == self.memory_size
Expand Down Expand Up @@ -325,7 +313,6 @@ def predict_pointgoal_batch_action_vel(self, goal_point, input_images, input_dep
naction = self.noise_scheduler.step(model_output=noise_pred, timestep=k, sample=naction).prev_sample

critic_values = self.predict_critic(naction, rgbd_embed)
all_trajectory = torch.cumsum(naction / 4.0, dim=1)

negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]]
positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]]
Expand All @@ -344,7 +331,6 @@ def predict_nogoal_batch_action_vel(self, input_images, input_depths, sample_num
naction = self.noise_scheduler.step(model_output=noise_pred, timestep=k, sample=naction).prev_sample

critic_values = self.predict_critic(naction, rgbd_embed)
all_trajectory = torch.cumsum(naction / 4.0, dim=1)

negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]]
positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]]
Expand Down
10 changes: 5 additions & 5 deletions internnav/model/encoder/navdp_backbone.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def forward(self, images, depths):
return memory_token


class NavDP_RGBD_Backbone(nn.Module):
class RGBDBackbone(nn.Module):
def __init__(
self,
image_size=224,
Expand Down Expand Up @@ -313,7 +313,7 @@ def _get_device(self):
return torch.device("cuda" if torch.cuda.is_available() else "cpu")


class NavDP_ImageGoal_Backbone(nn.Module):
class ImageGoalBackbone(nn.Module):
def __init__(self, image_size=224, embed_size=512, device='cuda:0'):
super().__init__()
if device is None:
Expand Down Expand Up @@ -376,8 +376,8 @@ def _get_device(self):
return torch.device("cuda" if torch.cuda.is_available() else "cpu")


class NavDP_PixelGoal_Backbone(nn.Module):
def __init__(self, image_size=224, embed_size=512, device='cuda:0'):
class PixelGoalBackbone(nn.Module):
def __init__(self, image_size=224, embed_size=512, pixel_channel=7, device='cuda:0'):
super().__init__()
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Expand All @@ -392,7 +392,7 @@ def __init__(self, image_size=224, embed_size=512, device='cuda:0'):
self.pixelgoal_encoder = DepthAnythingV2(**model_configs['vits'])
self.pixelgoal_encoder = self.pixelgoal_encoder.pretrained.float()
self.pixelgoal_encoder.patch_embed.proj = nn.Conv2d(
in_channels=7,
in_channels=pixel_channel,
out_channels=self.pixelgoal_encoder.patch_embed.proj.out_channels,
kernel_size=self.pixelgoal_encoder.patch_embed.proj.kernel_size,
stride=self.pixelgoal_encoder.patch_embed.proj.stride,
Expand Down
3 changes: 2 additions & 1 deletion scripts/train/configs/navdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
),
il=IlCfg(
epochs=1000,
batch_size=16,
batch_size=32,
lr=1e-4,
num_workers=8,
weight_decay=1e-4, # TODO
Expand Down Expand Up @@ -57,6 +57,7 @@
prior_sample=False,
memory_size=8,
predict_size=24,
pixel_channel=4,
temporal_depth=16,
heads=8,
token_dim=384,
Expand Down
5 changes: 3 additions & 2 deletions scripts/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ def main(config, model_class, model_config_class):
"""Main training function."""
_make_dir(config)

print(f"=== Start training ===")
print("=== Start training ===")
print(f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
print(f"Environment variables:")
print("Environment variables:")
print(f" RANK: {os.getenv('RANK', 'Not set')}")
print(f" LOCAL_RANK: {os.getenv('LOCAL_RANK', 'Not set')}")
print(f" WORLD_SIZE: {os.getenv('WORLD_SIZE', 'Not set')}")
Expand Down Expand Up @@ -172,6 +172,7 @@ def main(config, model_class, model_config_class):
config.il.batch_size,
config.il.image_size,
config.il.scene_scale,
pixel_channel=config.il.pixel_channel,
preload=config.il.preload,
random_digit=config.il.random_digit,
prior_sample=config.il.prior_sample,
Expand Down