razmars committed on
Commit 1ef6477 · 1 Parent(s): 405a306

Clean up SuperLinear configs, delete legacy files, update model + full_shot

config.json DELETED
@@ -1,38 +0,0 @@
- {
-   "_name_or_path": "super_linear",
-   "architectures": [
-     "SuperLinearForCausalLM"
-   ],
-   "auto_map": {
-     "AutoConfig": "configuration_super_linear_base.SuperLinearConfigBase",
-     "AutoModelForCausalLM": "modeling_super_linear.SuperLinearForCausalLM"
-   },
-   "auto_regressive": 1,
-   "d_model": 128,
-   "dropout": 0.0,
-   "fft_len": 5000,
-   "freeze_experts": 1,
-   "freq_experts": "mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600",
-   "inf_pred_len": 96,
-   "layer_type": "RLinear",
-   "linear_checkpoints_dir": "checkpoints5",
-   "linear_checkpoints_path": "/cs/azencot_fsas/MoE/",
-   "load_linear": 0,
-   "load_weights" :0,
-   "max_horizon": 96,
-   "misc_moe": 10,
-   "mlp_gating": 0,
-   "model_type": "super_linear",
-   "moe": 1,
-   "moe_n_experts": 12,
-   "moe_temp": 1,
-   "noisy_gating_std": 0.1,
-   "noisy_gating_std_decay": 1,
-   "pred_len": 96,
-   "seq_len": 512,
-   "moe_norm": 0,
-   "top_k_experts": 12,
-   "torch_dtype": "float32",
-   "transformers_version": "4.40.1",
-   "use_fft": 1
- }
configuration_super_linear.py CHANGED
@@ -1,22 +1,22 @@
  from typing import Optional, Tuple
  import torch, torch.nn as nn, torch.nn.functional as F
- from .configuration_super_linear_base import SuperLinearConfigBase
-
+ from transformers import (
+     PretrainedConfig,
+     PreTrainedModel,
+     GenerationMixin,
+     AutoConfig,
+     AutoModelForCausalLM,
+ )
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 
  # 1) --------------------------------------------------------------------------
  #    CONFIG
  # -----------------------------------------------------------------------------
 
 
- class SuperLinearConfig(SuperLinearConfigBase):
-     """
-     Configuration for the SuperLinear MoE time–series foundation model.
-     Only *model_type* must be unique inside transformers; the rest mirrors
-     the __init__ arguments of your original Config object.
-     """
+ class SuperLinearConfig(PretrainedConfig):
 
      model_type = "super_linear"
-
      def __init__(
          self,
          seq_len=512,
@@ -39,7 +39,7 @@ class SuperLinearConfig(SuperLinearConfigBase):
          load_weights =0,
          misc_moe = 10,
          mlp_gating = 0,
-         moe_norm = 1,
+         moe_norm = 0,
          model_type= "super_linear",
          moe_temp = 1,
          noisy_gating_std = 0.1,
@@ -47,6 +47,12 @@
          torch_dtype = "float32",
          transformers_version = "4.40.1",
          use_fft = 1,
+         train_epochs = 30,
+         patience = 5,
+         lradj = "constant",
+         learning_rate = 0.05,
+         channel_ind = 0,
+         full_size = 0,
          **kwargs,  # any extra CLI args
      ):
          self.seq_len = seq_len
@@ -74,4 +80,10 @@
          self.use_fft = use_fft
          self.fft_len = fft_len
          self.dropout = dropout
+         self.train_epochs = train_epochs
+         self.patience = patience
+         self.lradj = lradj
+         self.learning_rate = learning_rate
+         self.channel_ind = channel_ind
+         self.full_size = full_size
          super().__init__(**kwargs)
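The consolidated SuperLinearConfig now carries both the base hyper-parameters and the fine-tuning fields that previously lived in the deleted SuperLinearConfigFS. A minimal construction sketch, assuming the file is importable from the working directory; nothing below is prescribed by the commit itself:

from configuration_super_linear import SuperLinearConfig

cfg = SuperLinearConfig(
    seq_len=512,
    pred_len=96,
    moe_n_experts=12,
    # fine-tuning fields folded in from the removed FS variant
    train_epochs=30,
    patience=5,
    lradj="constant",
    learning_rate=0.05,
    channel_ind=0,
    full_size=0,
)
print(cfg.moe_norm)  # default is now 0, matching the old FS config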
configuration_super_linear_base.py DELETED
@@ -1,84 +0,0 @@
- from typing import Optional, Tuple
- import torch, torch.nn as nn, torch.nn.functional as F
-
- from transformers import (
-     PretrainedConfig,
-     PreTrainedModel,
-     GenerationMixin,
-     AutoConfig,
-     AutoModelForCausalLM,
- )
- from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-
- # 1) --------------------------------------------------------------------------
- #    CONFIG
- # -----------------------------------------------------------------------------
-
-
- class SuperLinearConfigBase(PretrainedConfig):
-     """
-     Configuration for the SuperLinear MoE time–series foundation model.
-     Only *model_type* must be unique inside transformers; the rest mirrors
-     the __init__ arguments of your original Config object.
-     """
-
-     model_type = "super_linear"
-
-     def __init__(
-         self,
-         seq_len=512,
-         pred_len=96,
-         inf_pred_len=96,
-         max_horizon=96,
-         moe_n_experts=12,
-         top_k_experts=5,
-         moe =1,
-         freq_experts= 'mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
-         auto_regressive= 1,
-         d_model= 128,
-         dropout= 0.0,
-         fft_len= 5000,
-         freeze_experts= 1,
-         layer_type= "RLinear",
-         linear_checkpoints_dir= "checkpoints5",
-         linear_checkpoints_path= "/cs/azencot_fsas/MoE/",
-         load_linear = 0,
-         load_weights =0,
-         misc_moe = 10,
-         mlp_gating = 0,
-         moe_norm = 1,
-         model_type= "super_linear",
-         moe_temp = 1,
-         noisy_gating_std = 0.1,
-         noisy_gating_std_decay = 1,
-         torch_dtype = "float32",
-         transformers_version = "4.40.1",
-         use_fft = 1,
-         **kwargs,  # any extra CLI args
-     ):
-         self.seq_len = seq_len
-         self.moe = moe
-         self.pred_len = pred_len
-         self.inf_pred_len = inf_pred_len
-         self.max_horizon = max_horizon
-         self.auto_regressive = auto_regressive
-         self.moe_n_experts = moe_n_experts
-         self.top_k_experts = top_k_experts
-         self.freq_experts = freq_experts
-         self.freeze_experts = freeze_experts
-         self.layer_type = layer_type
-         self.linear_checkpoints_path = linear_checkpoints_path
-         self.linear_checkpoints_dir = linear_checkpoints_dir
-         self.load_linear = load_linear
-         self.load_weights = load_weights
-         self.misc_moe = misc_moe
-         self.noisy_gating_std = noisy_gating_std
-         self.noisy_gating_std_decay = noisy_gating_std_decay
-         self.d_model = d_model
-         self.mlp_gating = mlp_gating
-         self.moe_norm = moe_norm
-         self.moe_temp = moe_temp
-         self.use_fft = use_fft
-         self.fft_len = fft_len
-         self.dropout = dropout
-         super().__init__(**kwargs)
configuration_super_linear_fs.py DELETED
@@ -1,90 +0,0 @@
- from typing import Optional, Tuple
- import torch, torch.nn as nn, torch.nn.functional as F
-
- from .configuration_super_linear_base import SuperLinearConfigBase
- from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-
- # 1) --------------------------------------------------------------------------
- #    CONFIG
- # -----------------------------------------------------------------------------
-
-
- class SuperLinearConfigFS(SuperLinearConfigBase):
-     """
-     Configuration for the SuperLinear MoE time–series foundation model.
-     Only *model_type* must be unique inside transformers; the rest mirrors
-     the __init__ arguments of your original Config object.
-     """
-
-     model_type = "super_linear"
-
-     def __init__(
-         self,
-         seq_len=512,
-         pred_len=96,
-         inf_pred_len=96,
-         max_horizon=96,
-         moe_n_experts=12,
-         top_k_experts=5,
-         moe =1,
-         freq_experts= 'mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
-         auto_regressive= 1,
-         d_model= 128,
-         dropout= 0.0,
-         fft_len= 5000,
-         freeze_experts= 1,
-         layer_type= "RLinear",
-         linear_checkpoints_dir= "checkpoints5",
-         linear_checkpoints_path= "/cs/azencot_fsas/MoE/",
-         load_linear = 0,
-         load_weights =0,
-         misc_moe = 10,
-         mlp_gating = 0,
-         moe_norm = 0,
-         model_type= "super_linear",
-         moe_temp = 1,
-         noisy_gating_std = 0.1,
-         noisy_gating_std_decay = 1,
-         torch_dtype = "float32",
-         transformers_version = "4.40.1",
-         use_fft = 1,
-         train_epochs = 30,
-         patience = 5,
-         lradj = "constant",
-         learning_rate = 0.05,
-         channel_ind = 0,
-         full_size = 0,
-         **kwargs,  # any extra CLI args
-     ):
-         self.seq_len = seq_len
-         self.moe = moe
-         self.pred_len = pred_len
-         self.inf_pred_len = inf_pred_len
-         self.max_horizon = max_horizon
-         self.auto_regressive = auto_regressive
-         self.moe_n_experts = moe_n_experts
-         self.top_k_experts = top_k_experts
-         self.freq_experts = freq_experts
-         self.freeze_experts = freeze_experts
-         self.layer_type = layer_type
-         self.linear_checkpoints_path = linear_checkpoints_path
-         self.linear_checkpoints_dir = linear_checkpoints_dir
-         self.load_linear = load_linear
-         self.load_weights = load_weights
-         self.misc_moe = misc_moe
-         self.noisy_gating_std = noisy_gating_std
-         self.noisy_gating_std_decay = noisy_gating_std_decay
-         self.d_model = d_model
-         self.mlp_gating = mlp_gating
-         self.moe_norm = moe_norm
-         self.moe_temp = moe_temp
-         self.use_fft = use_fft
-         self.fft_len = fft_len
-         self.dropout = dropout
-         self.train_epochs = train_epochs
-         self.patience = patience
-         self.lradj = lradj
-         self.learning_rate = learning_rate
-         self.channel_ind = channel_ind
-         self.full_size = full_size
-         super().__init__(**kwargs)
full_shot/config.json CHANGED
@@ -4,7 +4,7 @@
      "SuperLinearForCausalLM"
    ],
    "auto_map": {
-     "AutoConfig": "configuration_super_linear_fs.SuperLinearConfigFS",
+     "AutoConfig": "configuration_super_linear.SuperLinearConfig",
      "AutoModelForCausalLM": "modeling_super_linear.SuperLinearForCausalLM"
    },
    "auto_regressive": 1,
modeling_super_linear.py CHANGED
@@ -4,9 +4,7 @@ import torch, torch.nn as nn, torch.nn.functional as F
 
  from transformers import (PreTrainedModel,GenerationMixin,AutoConfig,AutoModelForCausalLM,)
  from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
- from .configuration_super_linear_base import SuperLinearConfigBase
  from .configuration_super_linear import SuperLinearConfig
- from .configuration_super_linear_fs import SuperLinearConfigFS
 
  from typing import Tuple, Union
 
@@ -549,9 +547,9 @@ class superLinear(nn.Module):
  "-------------------------------------------------------------------------------------------------------------------"
  class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
 
-     config_class = SuperLinearConfigBase
+     config_class = SuperLinearConfig
 
-     def __init__(self, config: Union[SuperLinearConfig, SuperLinearConfigFS]):
+     def __init__(self, config: SuperLinearConfig):
          super().__init__(config)
 
          # the backbone keeps its own Config dataclass, so build one on‑the‑fly:
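With config_class switched to SuperLinearConfig, the model is now built from a single config type rather than a Union of the deleted variants. A minimal sketch, assuming the repo files are importable as a local package (the relative imports in modeling_super_linear.py require a package context; the package name super_linear below is hypothetical):

from super_linear.configuration_super_linear import SuperLinearConfig
from super_linear.modeling_super_linear import SuperLinearForCausalLM

config = SuperLinearConfig(pred_len=96, inf_pred_len=96)
model = SuperLinearForCausalLM(config)  # only SuperLinearConfig is accepted now
assert SuperLinearForCausalLM.config_class is SuperLinearConfig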