abhinavv3 committed
Commit 0cac660 · 1 Parent(s): f58ea49

Fixed some issues and bugs. Finished trial training successfully.

Readme.md CHANGED
Binary files a/Readme.md and b/Readme.md differ
 
configs/config.json CHANGED
@@ -5,13 +5,14 @@
     "n_layer": 12,
     "n_head": 12,
     "n_embd": 768,
-    "n_kv_head": 4
+    "n_kv_head": 4,
+    "max_knn_memories": 81920
   },
   "training": {
     "max_steps": 19073,
     "log_dir": "log",
     "total_batch_size": 524288,
-    "B": 8,
+    "B": 64,
     "T": 1024,
     "max_lr": 0.0006,
     "min_lr": 0.00006,
log/log.txt CHANGED
@@ -0,0 +1,3 @@
+0 val 10.9481 shard_0
+0 train 10.947327 shard_0
+1 train 10.917969 shard_0
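
As a quick sanity check on these first values (my note, not from the repo): a freshly initialized model over the padded 50,304-token GPT-2 vocabulary should start near the uniform-prediction cross-entropy, which is in the same ballpark as the ~10.95 logged at step 0:

import math
# Cross-entropy of a uniform prediction over a 50304-token vocabulary.
print(math.log(50304))  # ~10.83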
model_core/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (162 Bytes).
 
model_core/__pycache__/attention.cpython-311.pyc CHANGED
Binary files a/model_core/__pycache__/attention.cpython-311.pyc and b/model_core/__pycache__/attention.cpython-311.pyc differ
 
model_core/__pycache__/model.cpython-311.pyc CHANGED
Binary files a/model_core/__pycache__/model.cpython-311.pyc and b/model_core/__pycache__/model.cpython-311.pyc differ
 
model_core/__pycache__/training.cpython-311.pyc CHANGED
Binary files a/model_core/__pycache__/training.cpython-311.pyc and b/model_core/__pycache__/training.cpython-311.pyc differ
 
model_core/__pycache__/training.cpython-312.pyc ADDED
Binary file (11.7 kB).
 
model_core/attention.py CHANGED
@@ -9,7 +9,7 @@ import torch._dynamo
 class RotaryPositionalEncoding(nn.Module):
     def __init__(self, dim, max_seq_len=2048, base=10000):
         super().__init__()
-        assert dim % 2 == 0
+        assert dim % 2 == 0, f"Dimension {dim} must be even for RoPE"
 
         self.dim = dim
         self.max_seq_len = max_seq_len
@@ -31,36 +31,27 @@ class RotaryPositionalEncoding(nn.Module):
         self._cached_seq_len = seq_len
         return self._cached_freqs[0][:seq_len], self._cached_freqs[1][:seq_len]
 
-    def apply_rotary_pos_emb(self, q, k):
-        q_len = q.shape[2]
-        k_len = k.shape[2]
-        assert q.shape[-1] == self.dim, f"Expected q.shape[-1] == {self.dim}, got {q.shape[-1]}"
-        assert k.shape[-1] == self.dim, f"Expected k.shape[-1] == {self.dim}, got {k.shape[-1]}"
-        assert q_len <= self.max_seq_len, f"seq_len {q_len} exceeds max_seq_len {self.max_seq_len}"
-        assert k_len <= self.max_seq_len, f"seq_len {k_len} exceeds max_seq_len {self.max_seq_len}"
-
-        device = q.device
-        cos_q, sin_q = self._get_freqs(q_len, device) # both [seq_len, dim//2]
-        cos_k, sin_k = self._get_freqs(k_len, device) # both [seq_len, dim//2]
-
+    def apply_rotary_pos_emb(self, x, seq_len=None):
+        if seq_len is None:
+            seq_len = x.shape[2]
+
+        assert x.shape[-1] == self.dim, f"Expected x.shape[-1] == {self.dim}, got {x.shape[-1]}"
+        assert seq_len <= self.max_seq_len, f"seq_len {seq_len} exceeds max_seq_len {self.max_seq_len}"
 
-        # Expand to match q/k: [1, 1, seq_len, dim//2]
-        cos_q = cos_q[None, None, :, :].expand(q.shape[0], q.shape[1], -1, -1)
-        sin_q = sin_q[None, None, :, :].expand(q.shape[0], q.shape[1], -1, -1)
-        cos_k = cos_k[None, None, :, :].expand(q.shape[0], q.shape[1], -1, -1)
-        sin_k = sin_k[None, None, :, :].expand(q.shape[0], q.shape[1], -1, -1)
+        device = x.device
+        cos, sin = self._get_freqs(seq_len, device) # both [seq_len, dim//2]
 
-        def apply(x, cos, sin):
-            x1 = x[..., ::2]
-            x2 = x[..., 1::2]
-
-            x_rotated_even = x1 * cos - x2 * sin
-            x_rotated_odd = x1 * sin + x2 * cos
-            return torch.stack((x_rotated_even, x_rotated_odd), dim=-1).flatten(-2)
+        # Expand to match x: [1, 1, seq_len, dim//2]
+        cos = cos[None, None, :, :].expand(x.shape[0], x.shape[1], -1, -1)
+        sin = sin[None, None, :, :].expand(x.shape[0], x.shape[1], -1, -1)
+
+        x1 = x[..., ::2] # even indices
+        x2 = x[..., 1::2] # odd indices
 
-        q_rot = apply(q, cos_q, sin_q)
-        k_rot = apply(k, cos_k, sin_k)
-        return q_rot, k_rot
+        x_rotated_even = x1 * cos - x2 * sin
+        x_rotated_odd = x1 * sin + x2 * cos
+
+        return torch.stack((x_rotated_even, x_rotated_odd), dim=-1).flatten(-2)
 
 class KNN():
     def __init__(self, dim, max_memories, process_rank=0):
@@ -150,18 +141,19 @@ class XLAttention(nn.Module):
         self.n_kv_head = getattr(config, 'n_kv_head', config.n_head)
         self.n_embd = config.n_embd
         self.head_dim = config.n_embd // config.n_head
-        self.kv_head_dim = config.n_embd // self.n_kv_head
+        self.kv_head_dim = self.head_dim
         self.group_size = self.n_head // self.n_kv_head
         self.dropout = nn.Dropout(config.dropout if hasattr(config, 'dropout') else 0.0)
         self.scale = self.head_dim ** -0.5
-
+
         self.q_proj = nn.Linear(config.n_embd, config.n_embd)
         self.k_proj = nn.Linear(config.n_embd, self.n_kv_head * self.kv_head_dim)
         self.v_proj = nn.Linear(config.n_embd, self.n_kv_head * self.kv_head_dim)
         self.c_proj = nn.Linear(config.n_embd, config.n_embd)
         self.c_proj.MEMGPT_SCALE_INIT = 1
 
-        self.rope = RotaryPositionalEncoding(self.head_dim)
+        self.rope_q = RotaryPositionalEncoding(self.head_dim)
+        self.rope_k = RotaryPositionalEncoding(self.kv_head_dim)
 
     def forward(self, x, xl_memory=None):
         B, T, C = x.size()
@@ -184,7 +176,8 @@ class XLAttention(nn.Module):
 
         # Apply rotary positional encoding
         seq_len = k.shape[2]
-        q, k = self.rope.apply_rotary_pos_emb(q, k)
+        q = self.rope_q.apply_rotary_pos_emb(q)
+        k = self.rope_k.apply_rotary_pos_emb(k)
 
         k = k.repeat_interleave(self.group_size, dim=1) # (B, n_head, T+xl, kv_head_dim)
         v = v.repeat_interleave(self.group_size, dim=1) # (B, n_head, T+xl, kv_head_dim)
@@ -226,7 +219,7 @@ class KNNAttention(nn.Module):
         self.n_kv_head = getattr(config, 'n_kv_head', config.n_head)
         self.n_embd = config.n_embd
         self.head_dim = config.n_embd // config.n_head
-        self.kv_head_dim = config.n_embd // self.n_kv_head
+        self.kv_head_dim = self.head_dim
         self.group_size = self.n_head // self.n_kv_head
         self.dropout = nn.Dropout(config.dropout if hasattr(config, 'dropout') else 0.0)
         self.scale = self.head_dim ** -0.5
@@ -241,7 +234,8 @@ class KNNAttention(nn.Module):
         self.topk_retrieved_memories = topk_retrieved_memories
         self.knn = knn
 
-        self.rope = RotaryPositionalEncoding(self.head_dim)
+        self.rope_q = RotaryPositionalEncoding(self.head_dim)
+        self.rope_k = RotaryPositionalEncoding(self.kv_head_dim)
 
     def forward(self, x, xl_memory=None):
         B, T, C = x.size()
@@ -265,7 +259,8 @@ class KNNAttention(nn.Module):
         v = v.view(B, -1, self.n_kv_head, self.kv_head_dim).transpose(1, 2) # (B, n_kv_head, seq_len, kv_head_dim) # GQAchange
 
         seq_len = k.shape[2]
-        q, k = self.rope.apply_rotary_pos_emb(q, k)
+        q = self.rope_q.apply_rotary_pos_emb(q)
+        k = self.rope_k.apply_rotary_pos_emb(k)
 
         k_expanded = k.repeat_interleave(self.group_size, dim=1) # (B, n_head, seq_len, kv_head_dim)
         v_expanded = v.repeat_interleave(self.group_size, dim=1) # (B, n_head, seq_len, kv_head_dim)
@@ -279,9 +274,12 @@ class KNNAttention(nn.Module):
         local_out = att @ v_expanded
 
         # KNN ATTENTION
+        # Making some modifications to the query shape for searching in the db, which is different from the original paper.
         if self.knn.index.ntotal > 0:
-            q_search = q.transpose(1, 2).contiguous().view(B, T, C)
-            mem_kv = self.knn.search(q_search, topk=self.topk_retrieved_memories)
+            q_grouped = q.view(B, self.n_kv_head, self.group_size, T, self.head_dim) # (B, n_head, T, head_dim) -> (B, n_kv_head, group_size, T, head_dim)
+            q_mean = q_grouped.mean(dim=2) # (B, 4, T, 64), took average across the 3 heads in each group
+            q_knn = q_mean.transpose(1, 2).contiguous().view(B, T, -1) # (B, T, 256)
+            mem_kv = self.knn.search(q_knn, topk=self.topk_retrieved_memories)
             mem_k, mem_v = mem_kv.unbind(dim=-2)
 
             # Reshape memory K,V according to KV head structure
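
The interesting change above is the query used for the KNN lookup: with grouped-query attention the memory index now stores keys/values of width n_kv_head * head_dim, so the per-head queries are averaged within each KV group before searching. A standalone sketch of that shape manipulation, using the n_head = 12, n_kv_head = 4, n_embd = 768 values from configs/config.json (illustrative only, not the module itself):

import torch

B, T = 2, 8                                   # toy batch and sequence length
n_head, n_kv_head, head_dim = 12, 4, 64       # head_dim = 768 // 12
group_size = n_head // n_kv_head              # 3 query heads per KV head

q = torch.randn(B, n_head, T, head_dim)
q_grouped = q.view(B, n_kv_head, group_size, T, head_dim)   # group query heads by KV head
q_mean = q_grouped.mean(dim=2)                              # average the 3 heads in each group
q_knn = q_mean.transpose(1, 2).contiguous().view(B, T, -1)  # (B, T, n_kv_head * head_dim)
print(q_knn.shape)                                          # torch.Size([2, 8, 256])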
model_core/model.py CHANGED
@@ -52,9 +52,10 @@ class GPT(nn.Module):
         super().__init__()
         self.config = config
         self.process_rank = process_rank
+        kv_dim = config.n_kv_head * (config.n_embd // config.n_head)
 
         # Initialize KNN memory
-        self.knn = KNN(config.n_embd, config.max_knn_memories, process_rank)
+        self.knn = KNN(kv_dim, config.max_knn_memories, process_rank)
 
         self.transformer = nn.ModuleDict(dict(
             wte=nn.Embedding(config.vocab_size, config.n_embd),
@@ -120,6 +121,13 @@
         return logits, loss
 
     def configure_optimizers(self, weight_decay, learning_rate, device_type, master_process):
+        # Print model parameters
+        total_params = sum(p.numel() for p in self.parameters())
+        print(f"Total parameters: {total_params}")
+        # Trainable parameters
+        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        print(f"Trainable parameters: {trainable_params}")
+
         # Get all parameters that require grad
         param_dict = {pn: p for pn, p in self.named_parameters()}
         param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
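
This keeps the KNN memory dimension in sync with the grouped KV projections: with the config values above, the index width drops from n_embd = 768 to 256, matching the (B, T, 256) query built in KNNAttention. A quick check of the arithmetic (my note, not code from the repo):

n_embd, n_head, n_kv_head = 768, 12, 4
head_dim = n_embd // n_head          # 64
kv_dim = n_kv_head * head_dim        # 256 (previously the index used n_embd = 768)
print(head_dim, kv_dim)              # 64 256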
model_core/training.py CHANGED
@@ -103,7 +103,6 @@ def train_memgpt(config_path,dataloader_class=None):
     for step in range(max_steps):
         t0 = time.time()
         last_step = (step == max_steps - 1)
-        print(f"validation loop.step={step}")
         if step % 350 == 0 or last_step:
             model.eval()
             val_loader.reset()
@@ -168,7 +167,6 @@
         loss_accum = 0.0
 
         for micro_step in range(grad_accum_steps):
-            print(f"micro tep= {micro_step}")
             x, y, current_shard_num = train_loader.next_batch()
             x, y = x.to(device), y.to(device)
 
requirements.txt2 DELETED
Binary file (2.52 kB)
 
rough_work.py DELETED
File without changes
scripts/generate.py CHANGED
@@ -1,63 +1,2 @@
-import torch
-import torch.nn.functional as F
-import tiktoken
-from model import GPT
 
-def generate_text(model, prompt, num_return_sequences=4, max_length=32, device='cuda'):
-    model.eval()
-    enc = tiktoken.get_encoding('gpt2')
-    tokens = enc.encode(prompt)
-    tokens = torch.tensor(tokens, dtype=torch.long)
-    tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
-    xgen = tokens.to(device)
-    sample_rng = torch.Generator(device=device)
-    sample_rng.manual_seed(42)
-
-    while xgen.size(1) < max_length:
-        with torch.no_grad():
-            logits, loss = model(xgen) # (B, T, vocab_size)
-            logits = logits[:, -1, :] # (B, vocab_size)
-            probs = F.softmax(logits, dim=-1)
-            topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
-            ix = torch.multinomial(topk_probs, 1, generator=sample_rng)
-            xcol = torch.gather(topk_indices, -1, ix)
-            xgen = torch.cat((xgen, xcol), dim=1)
-
-    generated_texts = []
-    for i in range(num_return_sequences):
-        tokens = xgen[i, :max_length].tolist()
-        decoded = enc.decode(tokens)
-        generated_texts.append(decoded)
-        print(f"Sample {i + 1}: {decoded}")
-
-
-    return generated_texts
-
-
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-print(f"running with {device}")
-
-
-checkpoint_path = 'log/model_final.pt'
-
-print(f"Loading checkpoint from {checkpoint_path}")
-checkpoint = torch.load(checkpoint_path,map_location=device)
-model_config = checkpoint['config']
-model_config.vocab_size = 50304
-model = GPT(model_config)
-
-
-model.load_state_dict(checkpoint['model'])
-model.to(device)
-
-
-
-prompt = "Hello, I'm a language model,"
-
-generated_texts = generate_text(
-    model=model,
-    prompt=prompt,
-    num_return_sequences=4,
-    max_length=32,
-    device=device
-)
+# Inference part not completed