Create eval loop and use full train dataset

This commit is contained in:
Craig
2025-04-12 10:55:10 +01:00
parent e9b97ac2b5
commit 0f3a96ca81
3 changed files with 266 additions and 50 deletions

199
train.py
View File

@@ -13,6 +13,7 @@ import torch.utils.data
# Project specific imports
from models.detection import get_maskrcnn_model
from utils.data_utils import PennFudanDataset, collate_fn, get_transform
from utils.eval_utils import evaluate # Import evaluate function
from utils.log_utils import setup_logging
@@ -108,31 +109,75 @@ def main(args):
sys.exit(1)
try:
dataset_train = PennFudanDataset(
# Create the full training dataset instance first
dataset_full = PennFudanDataset(
root=data_root, transforms=get_transform(train=True)
)
# Note: Validation split will be handled later (Prompt 12)
# dataset_val = PennFudanDataset(root=data_root, transforms=get_transform(train=False))
logging.info(f"Full dataset size: {len(dataset_full)}")
# TODO: Implement data splitting (e.g., using torch.utils.data.Subset)
# Create validation dataset instance with eval transforms
dataset_val_instance = PennFudanDataset(
root=data_root, transforms=get_transform(train=False)
)
# Split the dataset indices
torch.manual_seed(
config.get("seed", 42)
) # Use the same seed for consistent splits
indices = torch.randperm(len(dataset_full)).tolist()
val_split_ratio = config.get(
"val_split_ratio", 0.1
) # Default to 10% validation
val_split_count = int(val_split_ratio * len(dataset_full))
if val_split_count == 0 and len(dataset_full) > 0:
logging.warning(
f"Validation split resulted in 0 samples (ratio={val_split_ratio}, total={len(dataset_full)}). Using 1 sample for validation."
)
val_split_count = 1
elif val_split_count >= len(dataset_full):
logging.error(
f"Validation split ratio ({val_split_ratio}) too high, results in no training samples."
)
sys.exit(1)
train_indices = indices[:-val_split_count]
val_indices = indices[-val_split_count:]
# Create Subset datasets
dataset_train = torch.utils.data.Subset(dataset_full, train_indices)
dataset_val = torch.utils.data.Subset(dataset_val_instance, val_indices)
logging.info(
f"Using {len(train_indices)} samples for training and {len(val_indices)} for validation."
)
# Create DataLoaders
data_loader_train = torch.utils.data.DataLoader(
dataset_train,
batch_size=config.get("batch_size", 2),
# Shuffle should be true for the training subset loader
shuffle=True,
num_workers=config.get("num_workers", 4),
collate_fn=collate_fn,
pin_memory=config.get(
"pin_memory", True
), # Often improves GPU transfer speed
pin_memory=config.get("pin_memory", True),
)
logging.info(f"Training dataset size: {len(dataset_train)}")
logging.info(
f"Training dataloader configured with batch size {config.get('batch_size', 2)}"
data_loader_val = torch.utils.data.DataLoader(
dataset_val,
batch_size=config.get(
"batch_size", 2
), # Often use same or larger batch size for validation
shuffle=False, # No need to shuffle validation data
num_workers=config.get("num_workers", 4),
collate_fn=collate_fn,
pin_memory=config.get("pin_memory", True),
)
# Placeholder for validation loader
# data_loader_val = torch.utils.data.DataLoader(...)
logging.info(
f"Training dataloader configured. Est. batches: {len(data_loader_train)}"
)
logging.info(
f"Validation dataloader configured. Est. batches: {len(data_loader_val)}"
)
except Exception as e:
logging.error(f"Error setting up dataset/dataloader: {e}", exc_info=True)
@@ -187,12 +232,72 @@ def main(args):
logging.error(f"Error creating LR scheduler: {e}", exc_info=True)
sys.exit(1)
# --- Training Loop (Prompt 10) ---
# --- Resume Logic (Prompt 11) ---
start_epoch = 0
latest_checkpoint_path = None
if os.path.isdir(checkpoint_path):
checkpoints = sorted(
[f for f in os.listdir(checkpoint_path) if f.endswith(".pth")]
)
if checkpoints: # Check if list is not empty
latest_checkpoint_file = checkpoints[
-1
] # Get the last one (assuming naming convention like epoch_N.pth)
latest_checkpoint_path = os.path.join(
checkpoint_path, latest_checkpoint_file
)
logging.info(f"Found latest checkpoint: {latest_checkpoint_path}")
else:
logging.info("No checkpoints found in directory. Starting from scratch.")
else:
logging.info("Checkpoint directory not found. Starting from scratch.")
if latest_checkpoint_path:
try:
logging.info(f"Loading checkpoint '{latest_checkpoint_path}'")
# Ensure loading happens on the correct device
checkpoint = torch.load(latest_checkpoint_path, map_location=device)
# Load model state - handle potential 'module.' prefix if saved with DataParallel
model_state_dict = checkpoint["model_state_dict"]
# Simple check and correction for DataParallel prefix
if all(key.startswith("module.") for key in model_state_dict.keys()):
logging.info("Removing 'module.' prefix from checkpoint keys.")
model_state_dict = {
k.replace("module.", ""): v for k, v in model_state_dict.items()
}
model.load_state_dict(model_state_dict)
# Load optimizer state
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# Load LR scheduler state
lr_scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
# Load starting epoch (epoch saved is the one *completed*, so start from next)
start_epoch = checkpoint["epoch"]
logging.info(f"Resuming training from epoch {start_epoch + 1}")
# Optionally load and verify config consistency
# loaded_config = checkpoint.get('config')
# if loaded_config:
# # Perform checks if necessary
# pass
except Exception as e:
logging.error(
f"Error loading checkpoint: {e}. Starting training from scratch.",
exc_info=True,
)
start_epoch = 0 # Reset start_epoch if loading fails
# --- Training Loop (Prompt 10, modified for Prompt 11) ---
logging.info("--- Starting Training Loop --- ")
start_time = time.time()
num_epochs = config.get("num_epochs", 10)
for epoch in range(num_epochs):
# Modify loop to start from start_epoch
for epoch in range(start_epoch, num_epochs):
model.train() # Set model to training mode for each epoch
epoch_start_time = time.time()
logging.info(f"--- Epoch {epoch + 1}/{num_epochs} --- ")
@@ -256,17 +361,63 @@ def main(args):
logging.info(f" Learning Rate: {current_lr:.6f}")
logging.info(f" Epoch Duration: {epoch_duration:.2f}s")
# --- Checkpointing (Placeholder for Prompt 11) --- #
# Add checkpoint saving logic here, e.g.:
# if (epoch + 1) % config.get('checkpoint_freq', 1) == 0 or (epoch + 1) == num_epochs:
# # ... save checkpoint ...
# logging.info(f"Saved checkpoint for epoch {epoch + 1}")
# --- Checkpointing (Prompt 11) --- #
# Save checkpoint periodically or at the end
save_checkpoint = False
if (epoch + 1) % config.get("checkpoint_freq", 1) == 0:
save_checkpoint = True
logging.info(f"Checkpoint frequency met (epoch {epoch + 1})")
elif (epoch + 1) == num_epochs:
save_checkpoint = True
logging.info(f"Final epoch ({epoch + 1}) reached, saving checkpoint.")
# --- Evaluation (Placeholder for Prompt 12) --- #
# Add evaluation logic here, e.g.:
# if data_loader_val:
# evaluate(model, data_loader_val, device)
# logging.info(f"Ran evaluation for epoch {epoch + 1}")
if save_checkpoint:
checkpoint_filename = f"checkpoint_epoch_{epoch + 1}.pth"
save_path = os.path.join(checkpoint_path, checkpoint_filename)
try:
checkpoint_data = {
"epoch": epoch + 1,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"scheduler_state_dict": lr_scheduler.state_dict(),
"config": config, # Save config for reference
}
torch.save(checkpoint_data, save_path)
logging.info(f"Checkpoint saved to {save_path}")
except Exception as e:
logging.error(
f"Failed to save checkpoint for epoch {epoch + 1} to {save_path}: {e}",
exc_info=True,
)
# --- Evaluation (Prompt 12) --- #
if data_loader_val:
logging.info(f"Starting evaluation for epoch {epoch + 1}...")
try:
val_metrics = evaluate(model, data_loader_val, device)
logging.info(f"Epoch {epoch + 1} Validation Metrics: {val_metrics}")
# --- Best Model Checkpoint Logic (Optional Add-on) ---
# Add logic here to track the best metric (e.g., val_metrics['average_loss'])
# and save a separate 'best_model.pth' checkpoint if the current epoch is better.
# Example:
# if 'average_loss' in val_metrics:
# current_val_loss = val_metrics['average_loss']
# if best_val_loss is None or current_val_loss < best_val_loss:
# best_val_loss = current_val_loss
# best_model_path = os.path.join(output_path, 'best_model.pth')
# try:
# # Save only the model state_dict for the best model
# torch.save(model.state_dict(), best_model_path)
# logging.info(f"Saved NEW BEST model checkpoint to {best_model_path} (Val Loss: {best_val_loss:.4f})")
# except Exception as e:
# logging.error(f"Failed to save best model checkpoint: {e}", exc_info=True)
except Exception as e:
logging.error(
f"Error during evaluation for epoch {epoch + 1}: {e}", exc_info=True
)
# Decide if this error should stop the entire training process
# --- End of Training --- #
total_training_time = time.time() - start_time