
Loss and accuracy don't change when trying to run the GG experiment on a single custom dataset #29

@appledora

Description


I load my custom dataloaders for a particular dataset from a pickle file, like below:

train_loader = dataloader_dict[TASK_NAME]["train"]
val_loader = dataloader_dict[TASK_NAME]["val"]
test_loader = dataloader_dict[TASK_NAME]["test"]
print(f"Train size: {len(train_loader.dataset)}, Val size: {len(val_loader.dataset)}, Test size: {len(test_loader.dataset)}")
best_acc1 = [0.0 for _ in range(num_tasks+1)]
curr_acc1 = [0.0 for _ in range(num_tasks+1)]
adapt_acc1 = [0.0 for _ in range(num_tasks+1)]
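
For context, dataloader_dict itself is loaded from a pickle file roughly like this (the path here is a placeholder for my actual file):

import pickle

# Placeholder path; the pickle holds a {task_name: {"train"/"val"/"test": DataLoader}} dict.
with open("dataloaders.pkl", "rb") as f:
    dataloader_dict = pickle.load(f)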

I have slightly modified resnet.py to accept a num_class argument at initialization, so my training setup looks like this:

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import CosineAnnealingLR
# utils and CONFIG come from the repo's code (imports omitted here).

num_tasks = 1
task_idx = 1
criterion = nn.CrossEntropyLoss()
model = utils.get_model("ResNet34", NUM_CLASSES)
CONFIG.device = device
CONFIG.output_size = NUM_CLASSES
model = model.to(device)
print(device)
model.apply(lambda x: setattr(x, "task", task_idx))
params = []
param_count = 0
for name, param in model.named_parameters():
    if not param.requires_grad: continue
    param_count += param.numel()
    split = name.split(".")
    if split[-1] in ["scores", "s", "t"]:
        params.append(param)
lr = 0.1
optimizer = torch.optim.Adam(params, lr=lr, weight_decay=0.0001)
train_epochs = 250
scheduler = CosineAnnealingLR(optimizer, train_epochs)
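
The resnet.py change itself is small: the constructor now takes num_class and uses it to size the classifier head. A minimal illustration of the idea (on a plain linear head, not the repo's actual resnet.py):

import torch.nn as nn

class ClassifierHead(nn.Module):
    # Illustration only: the real edit is inside the repo's resnet.py.
    # The point is simply that the output size comes from a num_class argument.
    def __init__(self, in_features, num_class):
        super().__init__()
        self.fc = nn.Linear(in_features, num_class)

    def forward(self, x):
        return self.fc(x)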

These are my training and eval functions:

import tqdm
def train(model, writer, train_loader, optimizer, criterion, epoch, task_idx, data_loader=None):
    model.zero_grad()
    model.train()

    num_correct = 0
    total_seen = 0
    for batch_idx, (data, target) in tqdm.tqdm(enumerate(train_loader), desc = "TRAIN"):
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        loss = criterion(output, target)
        predictions = output.data.max(1, keepdim=True)[1]
        num_correct += predictions.eq(target.data.view_as(predictions)).sum()
        total_seen += target.size(0)
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(
                f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} "
                f"({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"
            )

@torch.no_grad()
def evaluate(model, val_loader, epoch):
    model.eval()
    num_correct = 0
    total_seen = 0
    for id, (batch, labels) in tqdm.tqdm(enumerate(val_loader), desc = "EVAL"):
        batch = batch.to(device)
        labels = labels.to(device)
        logits = model(batch)
        predictions = logits.argmax(dim=-1)
        num_correct += (predictions == labels).float().sum()
        total_seen += logits.size(0) 
    

    print(f"Val Perf after {epoch + 1} epochs Acc@1 {(num_correct / total_seen):0.4f}")
    return num_correct / total_seen
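
And this is roughly how I drive them each epoch (sketch of my outer loop; writer is unused, so I just pass None):

# Outer training loop (sketch): train, evaluate, step the LR schedule,
# and track the best validation accuracy for this task.
for epoch in range(train_epochs):
    train(model, None, train_loader, optimizer, criterion, epoch, task_idx)
    acc1 = evaluate(model, val_loader, epoch)
    scheduler.step()
    best_acc1[task_idx] = max(best_acc1[task_idx], float(acc1))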

I am using the config values from the rn50-supsup-adam.yaml file, and the args are set up accordingly.
However, no matter what I try, even after 40-50 epochs there is no meaningful change in loss or accuracy. What am I doing wrong here?
Additionally, for this single dataset I am using the following module types:

    conv_type="MaskConv",
    bn_type="NonAffineBN",
    conv_init="signed_constant",
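
Concretely, I set these on the same CONFIG/args object as above, before utils.get_model is called (a sketch, assuming the conv/bn choices are consumed when the layers are built):

CONFIG.conv_type = "MaskConv"
CONFIG.bn_type = "NonAffineBN"
CONFIG.conv_init = "signed_constant"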
