LeroyDyer committed on
Commit
de26508
1 Parent(s): 2a25736

Upload 3 files

Files changed (2)
  1. _Train_Model.py +196 -0
  2. configuration_mistral.py +147 -6
_Train_Model.py ADDED
@@ -0,0 +1,196 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline, AutoConfig, BitsAndBytesConfig
+ import time
+ import torch
+ torch.backends.cuda.matmul.allow_tf32 = True
+ import random
+ from datasets import load_dataset
+ from transformers import TrainingArguments
+ from trl import SFTTrainer
+ from peft import LoraConfig
+ # from accelerate import infer_auto_device_map, init_empty_weights, dispatch_model
+ from torch.nn import CrossEntropyLoss
+ torch.autograd.set_detect_anomaly(True)
+ random_seed = 42
+ torch.manual_seed(random_seed)
+ random.seed(random_seed)
+ # Set the device for each process
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # torch.cuda.set_device(device)
+
+ # Global defaults for the thought/talk generation settings consumed by model_init.
+ n_ahead_talk_global = 4
+ n_passes_global = 2
+ n_ahead_global = 8
+ n_examples = 0
+
+ def model_init(params):
+     """Load the custom MistralStar model and tokenizer, then apply the thought/talk settings."""
+     original = False
+     if params is None:
+         params = {}
+     else:
+         params = params.params
+     # save params to file
+     n_ahead = params.get("n_ahead", n_ahead_global if not original else 1)
+     n_ahead_talk = params.get("n_ahead_talk", n_ahead_talk_global if not original else 1)
+     n_passes = params.get("n_passes", n_passes_global if not original else 1)
+     gumbel_temperature = params.get("gumbel_temperature", 1)
+     use_start_thought_token = params.get("use_start_thought_token", True)
+     use_end_thought_token = params.get("use_end_thought_token", True)
+     include_policy_loss = params.get("include_policy_loss", True)
+     gumbel_detach = params.get("gumbel_detach", True)
+     merged_talk_heads = params.get("merged_talk_heads", True)
+     residual_think_head = params.get("residual_think_head", False)
+     optimize_lm_head_only_at_start = params.get("optimize_lm_head_only_at_start", False)
+
+     model_id = "LeroyDyer/SpydazWeb_AGI_MistralStar"
+     tokenizer_id = model_id
+     print("Loading model")
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+         max_thoughts=n_ahead + n_ahead_talk + 1,
+         merged_talk_heads=merged_talk_heads,
+         merged_lm_and_talk_heads=False,
+         merged_lm_and_think_heads=True,
+         use_concat_talk_head=True,
+         use_shallow_think=True,
+         use_shallow_talk=False,
+         use_complex_think_head=False,
+         use_complex_talk_head=True,
+         use_weighted_talk_head=True,
+         trust_remote_code=True,
+         device_map="auto",
+     )
+     print("Loaded model")
+
+     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, truncation=True, padding_side="right")
+     tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     model.gumbel_detach = gumbel_detach
+     model.include_policy_loss = include_policy_loss
+     model.use_end_thought_token = use_end_thought_token
+     model.use_start_thought_token = use_start_thought_token
+     model.n_ahead = n_ahead
+     model.n_ahead_talk = n_ahead_talk
+     model.n_passes = n_passes
+     model.residual_think_head = residual_think_head
+     model.optimize_lm_head_only_at_start = optimize_lm_head_only_at_start
+     model.gumbel_temperature = gumbel_temperature
+     model.original_mode = original
+     model.config_params = params
+     return model, tokenizer  # return both so the caller can unpack them
+
+ model, tokenizer = model_init(None)
+
+
+ ## TRAINING
+ peft_config = LoraConfig(
+     r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                       "gate_proj", "up_proj", "down_proj", "lm_head", "embed_tokens"],
+     lora_alpha = 32,
+     lora_dropout = 0, # Supports any, but = 0 is optimized
+     bias = "none",
+     use_dora=True,
+ )
+
+ ## DATA
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {}
+
+ ### Input:
+ {}
+
+ ### Response:
+ {}"""
+ EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
+ def formatting_prompts_func(examples):
+     instructions = examples["instruction"]
+     inputs = examples["input"]
+     outputs = examples["output"]
+     texts = []
+     for instruction, input, output in zip(instructions, inputs, outputs):
+         # Must add EOS_TOKEN, otherwise your generation will go on forever!
+         text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
+         texts.append(text)
+     return { "text" : texts, }
+
+ dataset = load_dataset("gate369/Alpaca-Star", split = "train[:2000]")
+ dataset = dataset.shuffle(seed=3704)
+ dataset = dataset.map(formatting_prompts_func, batched = True,)
+
+ ## TRAIN
+ max_seq_length = 1024
+ training_args = TrainingArguments(
+     output_dir="./out",
+     num_train_epochs=3,
+     per_device_train_batch_size=1,
+     gradient_checkpointing=False,
+     gradient_accumulation_steps=8,
+     optim="lion_32bit",
+     logging_steps=1,
+     save_strategy="steps",
+     max_steps=1000,
+     bf16=True,
+     tf32=False,
+     learning_rate=6e-05,
+     max_grad_norm=0.3,
+     warmup_ratio=0.06,
+     lr_scheduler_type="cosine",
+     push_to_hub=False,
+ )
+ trainer = SFTTrainer(
+     args=training_args,
+     train_dataset=dataset,
+     model=model,
+     tokenizer=tokenizer,
+     max_seq_length=max_seq_length,
+     dataset_text_field="text",
+     peft_config=peft_config,
+ )
+ trainer.train()
+
+ ## SAVE
+ tokenizer.save_pretrained("SFTTrainerModel")
+ model.save_pretrained("SFTTrainerModel")
+
+ ## UPLOAD
+ import huggingface_hub
+ from huggingface_hub import HfApi
+
+ MODEL_NAME = "_Spydaz_Web_AI_MistralStar"
+ Folderinput = "SFTTrainerModel"
+ WRITE_TOKEN = ""  # add a Hugging Face write token here
+ username = "LeroyDyer"
+ huggingface_hub.login(WRITE_TOKEN)
+ api = HfApi(token=WRITE_TOKEN)
+ # Create empty repo
+ api.create_repo(
+     repo_id = f"{username}/{MODEL_NAME}",
+     repo_type="model",
+     exist_ok=True,
+ )
+
+ api.upload_folder(
+     repo_id = f"{username}/{MODEL_NAME}",
+     folder_path = Folderinput,
+ )
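
For a quick smoke test after the upload, the checkpoint can be pulled back down and queried with the same Alpaca prompt template used for training. This is a minimal sketch, not part of the commit: it assumes the uploaded folder contains full model weights under `LeroyDyer/_Spydaz_Web_AI_MistralStar`; if only the LoRA/DoRA adapter was saved, it would instead need to be attached to the base model with `peft.PeftModel.from_pretrained`.

```python
# Sketch only (not in the commit): reload the pushed checkpoint and generate once.
# Assumes full weights were uploaded; for adapter-only repos use peft.PeftModel instead.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "LeroyDyer/_Spydaz_Web_AI_MistralStar"  # f"{username}/{MODEL_NAME}" from the script

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,  # the repo ships custom modelling/configuration code
)

# Reuse the training prompt template, leaving the response slot empty.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
prompt = alpaca_prompt.format("Name three uses of sliding window attention.", "", "")
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```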
configuration_mistral.py CHANGED
@@ -12,7 +12,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- """ Mistral model configuration"""
+ """Mistral model configuration"""
 
  from ...configuration_utils import PretrainedConfig
  from ...utils import logging
@@ -20,11 +20,6 @@ from ...utils import logging
 
  logger = logging.get_logger(__name__)
 
- MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-     "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json",
-     "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json",
- }
-
 
  class MistralConfig(PretrainedConfig):
      r"""
@@ -163,6 +158,152 @@ class MistralConfig(PretrainedConfig):
          self.use_complex_talk_head = use_complex_talk_head
          self.use_weighted_talk_head = use_weighted_talk_head
 
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+ class MistralStarConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate a
+     Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a configuration similar to that of Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1.
+
+     [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+     [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by
+             the `inputs_ids` passed when calling [`MistralModel`].
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 14336):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*, defaults to 8):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
+             constructed by meanpooling all the original heads within that group. For more details, check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+             The maximum sequence length that this model might ever be used with. Mistral's sliding window attention
+             allows sequences of up to 4096*32 tokens.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             The id of the padding token.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             The id of the "beginning-of-sequence" token.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             The id of the "end-of-sequence" token.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         sliding_window (`int`, *optional*, defaults to 4096):
+             Sliding window attention window size. If not specified, will default to `4096`.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from transformers import MistralModel, MistralConfig
+
+     >>> # Initializing a Mistral 7B style configuration
+     >>> configuration = MistralConfig()
+
+     >>> # Initializing a model from the Mistral 7B style configuration
+     >>> model = MistralModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "mistralstar"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=4096,
+         intermediate_size=14336,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=8,
+         hidden_act="silu",
+         max_position_embeddings=4096 * 32,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=1,
+         eos_token_id=2,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         sliding_window=4096,
+         attention_dropout=0.0,
+         max_thoughts=16,
+         thought_length=10,
+         merged_talk_heads=True,
+         merged_lm_and_talk_heads=False,
+         merged_lm_and_think_heads=True,
+         use_concat_talk_head=True,
+         use_shallow_think=True,
+         use_shallow_talk=False,
+         use_complex_think_head=False,
+         use_complex_talk_head=True,
+         use_weighted_talk_head=True,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.sliding_window = sliding_window
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.attention_dropout = attention_dropout
+         self.max_thoughts = max_thoughts
+         self.thought_length = thought_length
+         self.merged_talk_heads = merged_talk_heads
+         self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
+         self.merged_lm_and_think_heads = merged_lm_and_think_heads
+         self.use_concat_talk_head = use_concat_talk_head
+         self.use_shallow_think = use_shallow_think
+         self.use_shallow_talk = use_shallow_talk
+         self.use_complex_think_head = use_complex_think_head
+         self.use_complex_talk_head = use_complex_talk_head
+         self.use_weighted_talk_head = use_weighted_talk_head
+
          super().__init__(
              pad_token_id=pad_token_id,
              bos_token_id=bos_token_id,
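
The relative imports (`from ...configuration_utils import PretrainedConfig`) mean this file is written to sit inside the transformers source tree. If it is instead used as a standalone module, the new `model_type = "mistralstar"` has to be registered with the Auto classes before `from_pretrained` can resolve it. Below is a hypothetical registration sketch; it assumes absolute imports and a companion `MistralStarForCausalLM` modelling class, neither of which is part of this commit.

```python
# Hypothetical registration sketch; the module and class names below are assumptions.
from transformers import AutoConfig, AutoModelForCausalLM

from configuration_mistral import MistralStarConfig    # this file, with absolute imports
from modeling_mistral import MistralStarForCausalLM    # assumed companion modelling file

# Map the "mistralstar" model_type to the new config, and the config to its model class.
AutoConfig.register("mistralstar", MistralStarConfig)
AutoModelForCausalLM.register(MistralStarConfig, MistralStarForCausalLM)

# After registration, AutoModelForCausalLM.from_pretrained(...) can instantiate checkpoints
# whose config.json declares "model_type": "mistralstar".
```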