Spaces: Runtime error
```python
import torch
import os
import sys
import traceback
import requests
import json
import platform

print("=" * 50)
print("DETAILED MODEL LOADING DIAGNOSTIC")
print("=" * 50)
# System information
print("\n1. SYSTEM INFORMATION:")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"Platform: {platform.platform()}")
print(f"Processor: {platform.processor()}")
# Environment variables
print("\n2. ENVIRONMENT VARIABLES:")
relevant_vars = ["CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "TRANSFORMERS_CACHE", "HF_HOME"]
for var in relevant_vars:
    print(f"{var}: {os.environ.get(var, 'Not set')}")
# GPU information
print("\n3. GPU DETECTION:")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    try:
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

        # Test GPU with a simple operation
        print("\nTesting GPU with tensor operations...")
        test_tensor = torch.rand(1000, 1000, device="cuda")
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        result = torch.matmul(test_tensor, test_tensor)
        end.record()
        torch.cuda.synchronize()
        print(f"GPU tensor operation completed in {start.elapsed_time(end):.2f} ms")
        # Memory info
        print(f"\nTotal GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"Allocated GPU memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"Reserved GPU memory: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
    except Exception as e:
        print(f"Error testing GPU: {str(e)}")
        traceback.print_exc()
else:
    print("CUDA is not available. This is a critical issue for model loading.")
# HuggingFace hub connectivity
print("\n4. HUGGINGFACE HUB CONNECTIVITY:")
try:
    print("Testing connection to HuggingFace Hub...")
    response = requests.get(
        "https://huggingface.co/api/models/OpenGVLab/InternViT-6B-224px",
        timeout=10,  # avoid hanging the diagnostic if the Hub is unreachable
    )
    if response.status_code == 200:
        print("Successfully connected to HuggingFace Hub")
        model_info = response.json()
        print("Model exists: OpenGVLab/InternViT-6B-224px")
        if 'downloads' in model_info:
            print(f"Downloads: {model_info['downloads']}")
    else:
        print(f"Failed to connect to HuggingFace Hub: Status code {response.status_code}")
        print(response.text)
except Exception as e:
    print(f"Error connecting to HuggingFace Hub: {str(e)}")
    traceback.print_exc()
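# Alternative sketch: the huggingface_hub client wraps this same endpoint
# with auth and retries, e.g. huggingface_hub.model_info("OpenGVLab/InternViT-6B-224px").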
# Attempt model loading with detailed error capture
print("\n5. ATTEMPTING MODEL LOADING:")
try:
    print("Importing transformers...")
    from transformers import AutoModel, AutoProcessor
    print("✓ Transformers imported successfully")

    print("\nLoading AutoProcessor...")
    processor = AutoProcessor.from_pretrained("OpenGVLab/InternViT-6B-224px")
    print("✓ AutoProcessor loaded successfully")

    print("\nLoading AutoModel...")
    model = AutoModel.from_pretrained("OpenGVLab/InternViT-6B-224px")
    print("✓ AutoModel loaded successfully")

    if torch.cuda.is_available():
        print("\nMoving model to CUDA...")
        model = model.to("cuda")
        print("✓ Model moved to CUDA successfully")

    print("\nModel loading SUCCESSFUL")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
except Exception as e:
    print(f"\n✗ ERROR LOADING MODEL: {str(e)}")
    print("\nDetailed traceback:")
    traceback.print_exc()
| print("\n" + "=" * 50) | |
| print("DIAGNOSTIC COMPLETE") | |
| print("=" * 50) |