# Spaces:
# Sleeping
# Sleeping
import os
import threading

import boto3
from boto3.s3.transfer import TransferConfig
from tqdm import tqdm
def upload_file_to_s3(file_path, bucket_name, s3_prefix):
    """Upload a local file to S3, showing a tqdm progress bar.

    The object key is ``<s3_prefix>/<basename of file_path>``. Trailing
    slashes on the prefix are dropped, and an empty (or ``None``) prefix
    stores the object under its bare file name, so the key never contains
    an accidental leading or doubled ``/``.

    Files larger than 25 MiB are sent as a multipart upload in 25 MiB parts
    using up to 10 threads.

    Args:
        file_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        s3_prefix: Key prefix ("folder") under which the file is stored.

    Returns:
        The ``s3://<bucket>/<key>`` URI on success, or ``None`` when the
        upload failed (the error is printed, not raised).
    """
    mib = 1024 * 1024

    class ProgressPercentage(object):
        """Transfer callback that advances a tqdm bar by the bytes sent."""

        def __init__(self, filename):
            self._filename = filename
            self._size = float(os.path.getsize(filename))
            self._seen_so_far = 0
            # With use_threads=True the callback may be invoked from several
            # worker threads, so the counter and bar are updated under a lock.
            self._lock = threading.Lock()
            self._pbar = tqdm(
                total=self._size,
                unit='B',
                unit_scale=True,
                desc=f"Uploading {os.path.basename(filename)}",
            )

        def __call__(self, bytes_amount):
            with self._lock:
                self._seen_so_far += bytes_amount
                self._pbar.update(bytes_amount)

        def close(self):
            """Release the progress bar so it stops owning the terminal line."""
            self._pbar.close()

    s3_client = boto3.client('s3')
    file_name = os.path.basename(file_path)
    prefix = (s3_prefix or "").rstrip("/")
    s3_path = f"{prefix}/{file_name}" if prefix else file_name

    # Configure multipart upload. TransferConfig sizes are in bytes.
    config = TransferConfig(
        multipart_threshold=25 * mib,  # 25MB
        max_concurrency=10,
        multipart_chunksize=25 * mib,  # 25MB
        use_threads=True,
    )

    progress = None
    try:
        # Built inside the try so a missing file is reported like any other
        # upload failure instead of propagating to the caller.
        progress = ProgressPercentage(file_path)
        s3_client.upload_file(
            file_path,
            bucket_name,
            s3_path,
            Config=config,
            Callback=progress,
        )
        return f"s3://{bucket_name}/{s3_path}"
    except Exception as e:
        print(f"Failed to upload {file_path} to S3: {e}")
        return None
    finally:
        if progress is not None:
            progress.close()
import math

# Learning-rate schedule settings read by plot_lr_schedule() and passed on
# to get_lr_lambda().
max_lr = 1e-3        # peak learning rate, reached at the end of warmup
warmup_steps = 10    # number of linear warmup steps
max_steps = 25_000   # step at which the cosine decay reaches its floor
def get_lr_lambda(current_step, warmup_steps, max_steps, max_lr):
    """Return the learning rate for ``current_step``.

    The schedule has three phases:

    1. Linear warmup: for ``current_step < warmup_steps`` the rate rises
       linearly and reaches ``max_lr`` on the last warmup step.
    2. Cosine decay: from ``warmup_steps`` up to ``max_steps`` the rate
       follows a half cosine from ``max_lr`` down to the minimum rate.
    3. Floor: from ``max_steps`` onwards the rate stays at the minimum,
       which is 10% of ``max_lr``.

    Args:
        current_step: Zero-based training step.
        warmup_steps: Number of warmup steps.
        max_steps: Step at which the decay reaches the minimum rate.
        max_lr: Peak learning rate.

    Returns:
        The learning rate as a float.

    NOTE(review): the value returned is an absolute learning rate, not a
    multiplier of a base rate. Confirm that callers use it that way.
    """
    min_lr = max_lr * 0.1  # Minimum learning rate (10% of max_lr)

    if current_step < warmup_steps:
        # Linear warmup; "+ 1" so step 0 already has a non-zero rate.
        return max_lr * (current_step + 1) / warmup_steps

    if current_step >= max_steps:
        # ">=" rather than ">": the cosine term is exactly 0 at max_steps, so
        # the value is unchanged, and the division below can no longer hit
        # zero when max_steps == warmup_steps.
        return min_lr

    # Cosine decay between warmup_steps and max_steps.
    decay_ratio = (current_step - warmup_steps) / (max_steps - warmup_steps)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)
def plot_lr_schedule():
    """Show a plot of the learning-rate schedule.

    Evaluates get_lr_lambda at every step from 0 up to max_steps + 99, using
    the module-level warmup_steps, max_steps and max_lr, and displays the
    resulting curve with matplotlib.
    """
    import matplotlib.pyplot as plt

    # Go 100 steps past max_steps so the part after the decay is visible.
    xs = list(range(max_steps + 100))
    ys = [get_lr_lambda(x, warmup_steps, max_steps, max_lr) for x in xs]

    plt.figure(figsize=(10, 5))
    axes = plt.gca()
    axes.plot(xs, ys)
    axes.set_title('Learning Rate Schedule')
    axes.set_xlabel('Steps')
    axes.set_ylabel('Learning Rate')
    axes.grid(True)
    plt.show()
def _parse_loss_log(log_file_path):
    """Return (steps, losses) read from "Batch N, Running Avg Loss: X" lines."""
    import re

    # The loss may be written as a plain decimal or in scientific notation
    # (e.g. "1.5e-05"). A trailing sentence period is not part of the number.
    pattern = re.compile(
        r"Batch (\d+), Running Avg Loss: "
        r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
    )
    steps = []
    losses = []
    # errors="replace" keeps stray non-UTF-8 bytes (e.g. from progress bars)
    # from aborting the parse; they cannot be part of a matching line anyway.
    with open(log_file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            match = pattern.search(line)
            if match:
                steps.append(int(match.group(1)))
                losses.append(float(match.group(2)))
    return steps, losses


def plot_training_loss(log_file_path, output_path=None):
    """Plot the running average loss from a training log against batch steps.

    Lines of the form ``Batch <n>, Running Avg Loss: <x>`` are extracted from
    the log. The losses are plotted together with a quadratic trend line; if
    the fit fails, a moving average is drawn instead. The minimum and maximum
    loss are annotated.

    Args:
        log_file_path (str): Path to the training log file.
        output_path (str, optional): Path to save the plot as PNG. If None,
            the plot is displayed instead.

    Returns:
        None. When the log contains no loss lines a message is printed and
        nothing is plotted.

    Raises:
        OSError: If the log file cannot be opened.
    """
    steps, losses = _parse_loss_log(log_file_path)
    if not steps:
        print("No loss data found in the log file.")
        return

    # Plotting dependencies are only needed once there is data to draw.
    import matplotlib.pyplot as plt
    import numpy as np
    from scipy.optimize import curve_fit

    # Create the plot
    fig = plt.figure(figsize=(12, 6))
    plt.plot(steps, losses, 'b-', alpha=0.5, label='Running Avg Loss')

    # Add trend line (using polynomial fit)
    def poly_func(x, a, b, c):
        return a * x**2 + b * x + c

    # Convert to numpy arrays for curve fitting
    x_array = np.array(steps)
    y_array = np.array(losses)

    try:
        popt, _ = curve_fit(poly_func, x_array, y_array)
        x_line = np.linspace(min(steps), max(steps), 1000)
        y_line = poly_func(x_line, *popt)
        plt.plot(x_line, y_line, 'r-', label='Trend Line')
    except Exception as e:
        print(f"Could not fit trend line: {e}")
        # Fallback to simple moving average for trend
        if len(steps) > 100:
            window_size = min(len(steps) // 10, 100)
        else:
            window_size = len(steps) // 2
        if window_size > 0:
            kernel = np.ones(window_size) / window_size
            moving_avg = np.convolve(y_array, kernel, mode='valid')
            # mode='valid' drops the first window_size - 1 points.
            plt.plot(steps[window_size - 1:], moving_avg, 'r-',
                     label='Moving Average Trend')

    # Add labels and title
    plt.xlabel('Batch Number')
    plt.ylabel('Running Average Loss')
    plt.title('Training Loss Over Time')
    plt.grid(True)
    plt.legend()

    # Add min and max loss annotations
    min_loss = min(losses)
    min_idx = losses.index(min_loss)
    max_loss = max(losses)
    max_idx = losses.index(max_loss)
    plt.annotate(f'Min: {min_loss:.5f}',
                 xy=(steps[min_idx], min_loss),
                 xytext=(steps[min_idx], min_loss * 1.05),
                 arrowprops=dict(facecolor='green', shrink=0.05),
                 fontsize=10)
    plt.annotate(f'Max: {max_loss:.5f}',
                 xy=(steps[max_idx], max_loss),
                 xytext=(steps[max_idx], max_loss * 0.95),
                 arrowprops=dict(facecolor='red', shrink=0.05),
                 fontsize=10)

    # Save or show the plot
    plt.tight_layout()
    if output_path:
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        # Close the saved figure so repeated calls do not accumulate figures.
        plt.close(fig)
        print(f"Plot saved to {output_path}")
    else:
        plt.show()
if __name__ == "__main__":
    # Script entry point: draw the loss curve from training.log and save it
    # as train_loss.png.
    # plot_lr_schedule()
    plot_training_loss(log_file_path="training.log", output_path="train_loss.png")