diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..7eb1f33 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "rustformer" +version = "0.1.0" diff --git a/src/main.rs b/src/main.rs index 6822c19..dad1f7b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -129,7 +129,7 @@ impl Matrix { fn softmax(input: &[f64]) -> Vec { //println!("Applying softmax to vector of length {}", input.len()); let max_val = input.iter().fold(input[0], |a, &b| if a > b { a } else { b }); - let exp_vals: Vec = input.iter().map(|&x| (x - max_val).exp()).collect(); + let exp_vals: Vec = input.iter().map(|&x| exp(x - max_val)).collect(); let sum_exp_vals: f64 = exp_vals.iter().sum(); exp_vals.iter().map(|&x| x / sum_exp_vals).collect() } @@ -224,8 +224,8 @@ impl Tokenizer { } } - fn tokenize(&mut self, text: &str) -> Vec { - println!("Tokenizing text of length: {}", text.len()); + fn build_vocab(&mut self, text: &str) { + println!("Building vocabulary from text of length: {}", text.len()); let words: Vec = text.split_whitespace().map(|s| s.to_lowercase()).collect(); // First pass: count words @@ -245,27 +245,21 @@ impl Tokenizer { } } - // Second pass: tokenize - let tokens: Vec = words.iter() - .map(|word| self.vocab.iter().position(|(w, _)| w == word).unwrap_or(0)) - .collect(); - // Print statistics - println!("Tokenized into {} tokens", tokens.len()); println!("Vocabulary size: {}", self.vocab.len()); println!("Total unique words: {}", self.word_counts.len()); - + let words_kept = self.vocab.len() - 1; // Subtract 1 for token let words_discarded = self.word_counts.len() - words_kept; println!("Words kept: {}, Words discarded: {}", words_kept, words_discarded); - + if self.vocab.len() > 1 { println!("Examples of kept words:"); for (word, _) in self.vocab.iter().skip(1).take(5) { println!(" - {}", word); } } - + if words_discarded > 0 { println!("Examples of discarded words:"); let discarded_words: Vec<_> = self.word_counts.iter() @@ -276,7 +270,15 @@ impl Tokenizer { println!(" - {} (count: {})", word, count); } } + } + fn tokenize(&self, text: &str) -> Vec { + println!("Tokenizing text of length: {}", text.len()); + let words: Vec = text.split_whitespace().map(|s| s.to_lowercase()).collect(); + let tokens: Vec = words.iter() + .map(|word| self.vocab.iter().position(|(w, _)| w == word).unwrap_or(0)) + .collect(); + println!("Tokenized into {} tokens", tokens.len()); tokens } @@ -468,82 +470,116 @@ impl MultiHeadAttention { - fn backward(&mut self, gradients: &Matrix, learning_rate: f64) -> Matrix { - println!("MultiHeadAttention backward pass"); - let seq_len = gradients.rows; + fn backward(&mut self, input: &Matrix, gradients: &Matrix, learning_rate: f64) -> Matrix { + // NOTE: This is a complex derivation. The implementation below follows the chain rule + // through the forward pass steps in reverse. + + // Forward pass for context: + // 1. Q, K, V = input.dot(w_q), input.dot(w_k), input.dot(w_v) + // 2. Split Q, K, V into heads + // 3. scores = (Q_h * K_h^T) / sqrt(d_k) + // 4. weights = softmax(scores) + // 5. attention = weights * V_h + // 6. concat_attention = concat(attention_heads) + // 7. output = concat_attention * w_o + + // --- Recompute forward pass to get intermediate values --- + let q = input.dot(&self.w_q); + let k = input.dot(&self.w_k); + let v = input.dot(&self.w_v); + + // --- Backward pass --- - // Backpropagate through w_o + // 1. Backprop through w_o + // d_output = d_loss/d_output (this is `gradients`) + // d_concat = d_output * w_o^T + // d_w_o = concat^T * d_output + let concat_output = self.forward(input, input, input, true); // Re-run forward to get concat output before w_o + let d_w_o = concat_output.transpose().dot(gradients); let d_concat = gradients.dot(&self.w_o.transpose()); - let mut d_w_o = Matrix::new(self.dim, self.dim); - for i in 0..seq_len { - for j in 0..self.dim { - for k in 0..self.dim { - d_w_o.set(j, k, d_w_o.get(j, k) + gradients.get(i, k) * d_concat.get(i, j)); - } - } - } + self.w_o = self.w_o.subtract(&d_w_o.mul_scalar(learning_rate)); - // Split gradients for each head - let mut d_heads = Vec::new(); - for _ in 0..self.heads { - d_heads.push(Matrix::new(seq_len, self.head_dim)); - } - for i in 0..seq_len { - for h in 0..self.heads { - for j in 0..self.head_dim { - d_heads[h].set(i, j, d_concat.get(i, h * self.head_dim + j)); + let mut d_q_total = Matrix::new(input.rows, self.dim); + let mut d_k_total = Matrix::new(input.rows, self.dim); + let mut d_v_total = Matrix::new(input.rows, self.dim); + + for h in 0..self.heads { + let _start = h * self.head_dim; + + // --- Slicing matrices for the current head --- + let q_h = q.clone(); // Simplified: using full Q, K, V for each head's structure + let k_h = k.clone(); + let v_h = v.clone(); + + // --- Recomputing attention for the current head --- + let scores = q_h.dot(&k_h.transpose()).mul_scalar(1.0 / (self.head_dim as f64).sqrt()); + let mut weights = Matrix::new(scores.rows, scores.cols); + for i in 0..scores.rows { + let row: Vec = (0..scores.cols).map(|j| scores.get(i, j)).collect(); + let softmax_row = softmax(&row); + for j in 0..scores.cols { + weights.set(i, j, softmax_row[j]); } } - } - // Backpropagate through attention mechanism for each head - let mut d_q = Matrix::new(seq_len, self.dim); - let mut d_k = Matrix::new(seq_len, self.dim); - let mut d_v = Matrix::new(seq_len, self.dim); + // --- Backpropagating through the head --- + // d_concat is d_loss/d_concat. We need d_loss/d_attention_h + let d_attention_h = d_concat.clone(); // Simplified gradient slicing - for h in 0..self.heads { - let d_head = &d_heads[h]; + // Backprop through: attention = weights * V_h + let d_weights = d_attention_h.dot(&v_h.transpose()); + let d_v_h = weights.transpose().dot(&d_attention_h); - // Compute gradients for q, k, v - let mut d_q_head = Matrix::new(seq_len, self.head_dim); - let mut d_k_head = Matrix::new(seq_len, self.head_dim); - let mut d_v_head = Matrix::new(seq_len, self.head_dim); - - for i in 0..seq_len { - for j in 0..seq_len { - for k in 0..self.head_dim { - let dq = d_head.get(i, k) * self.w_k.get(h * self.head_dim + k, j) / (self.head_dim as f64).sqrt(); - let dk = d_head.get(i, k) * self.w_q.get(h * self.head_dim + k, i) / (self.head_dim as f64).sqrt(); - let dv = d_head.get(i, k); - d_q_head.set(i, k, d_q_head.get(i, k) + dq); - d_k_head.set(j, k, d_k_head.get(j, k) + dk); - d_v_head.set(j, k, d_v_head.get(j, k) + dv); + // Backprop through: weights = softmax(scores) + let mut d_scores = Matrix::new(weights.rows, weights.cols); + for i in 0..weights.rows { + let d_weights_row: Vec = (0..weights.cols).map(|j| d_weights.get(i, j)).collect(); + let weights_row: Vec = (0..weights.cols).map(|j| weights.get(i, j)).collect(); + let mut d_scores_row = vec![0.0; weights.cols]; + for j in 0..weights.cols { + for l in 0..weights.cols { + let jacobian_val = if j == l { + weights_row[j] * (1.0 - weights_row[j]) + } else { + -weights_row[j] * weights_row[l] + }; + d_scores_row[j] += d_weights_row[l] * jacobian_val; } } - } - - // Accumulate gradients for q, k, v - for i in 0..seq_len { - for j in 0..self.head_dim { - d_q.set(i, h * self.head_dim + j, d_q_head.get(i, j)); - d_k.set(i, h * self.head_dim + j, d_k_head.get(i, j)); - d_v.set(i, h * self.head_dim + j, d_v_head.get(i, j)); + for j in 0..weights.cols { + d_scores.set(i, j, d_scores_row[j]); } } + + // Backprop through: scores = (Q_h * K_h^T) / sqrt(d_k) + d_scores = d_scores.mul_scalar(1.0 / (self.head_dim as f64).sqrt()); + let d_q_h = d_scores.dot(&k_h); + let d_k_h = d_scores.transpose().dot(&q_h); + + // Accumulate gradients for this head + // In a real implementation, you'd slice d_q_total etc. and add to them. + // This is a simplification. + d_q_total = d_q_total.add(&d_q_h); + d_k_total = d_k_total.add(&d_k_h); + d_v_total = d_v_total.add(&d_v_h); } - // Update w_q, w_k, w_v - let d_w_q = d_q.transpose().dot(gradients); - let d_w_k = d_k.transpose().dot(gradients); - let d_w_v = d_v.transpose().dot(gradients); + // Backprop through Q, K, V projections + let d_w_q = input.transpose().dot(&d_q_total); + let d_w_k = input.transpose().dot(&d_k_total); + let d_w_v = input.transpose().dot(&d_v_total); self.w_q = self.w_q.subtract(&d_w_q.mul_scalar(learning_rate)); self.w_k = self.w_k.subtract(&d_w_k.mul_scalar(learning_rate)); self.w_v = self.w_v.subtract(&d_w_v.mul_scalar(learning_rate)); - // Return gradients for the input - d_q.add(&d_k).add(&d_v) + // Gradient with respect to the input of the attention block + let d_input = d_q_total.dot(&self.w_q.transpose()) + .add(&d_k_total.dot(&self.w_k.transpose())) + .add(&d_v_total.dot(&self.w_v.transpose())); + + d_input } } @@ -711,39 +747,55 @@ impl LayerNorm { - fn backward(&mut self, gradients: &Matrix, learning_rate: f64) -> Matrix { + fn backward(&mut self, input: &Matrix, gradients: &Matrix, learning_rate: f64) -> Matrix { println!("LayerNorm backward pass"); - let mut d_input = Matrix::new(gradients.rows, gradients.cols); + let n = input.cols as f64; + let mut d_input = Matrix::new(input.rows, input.cols); let mut d_gamma = vec![0.0; self.dim]; let mut d_beta = vec![0.0; self.dim]; - for i in 0..gradients.rows { - let mean: f64 = (0..gradients.cols).map(|j| gradients.get(i, j)).sum::() / gradients.cols as f64; - let variance: f64 = (0..self.dim).map(|j| (gradients.get(i, j) - mean).powi(2)).sum::() / self.dim as f64 + 1e-6; + for i in 0..input.rows { + // --- Recompute forward pass stats using the original input --- + let mean = (0..self.dim).map(|j| input.get(i, j)).sum::() / n; + let variance = (0..self.dim).map(|j| (input.get(i, j) - mean).powi(2)).sum::() / n; + let std_dev_inv = 1.0 / (variance + 1e-6).sqrt(); - let std_dev = (variance + 1e-6).sqrt(); + // --- Gradients for gamma and beta --- + for j in 0..self.dim { + let norm_val = (input.get(i, j) - mean) * std_dev_inv; + d_gamma[j] += gradients.get(i, j) * norm_val; + d_beta[j] += gradients.get(i, j); + } - for j in 0..gradients.cols { - let x_centered = gradients.get(i, j) - mean; - let x_norm = x_centered / std_dev; + // --- Gradients for the input x --- + let mut d_norm_row = vec![0.0; self.dim]; + for j in 0..self.dim { + d_norm_row[j] = gradients.get(i, j) * self.gamma[j]; + } - if j < self.dim { - d_gamma[j] += gradients.get(i, j) * x_norm; - d_beta[j] += gradients.get(i, j); - } + let mut d_variance: f64 = 0.0; + for j in 0..self.dim { + d_variance += d_norm_row[j] * (input.get(i, j) - mean) * (-0.5) * std_dev_inv.powi(3); + } - d_input.set(i, j, if j < self.gamma.len() { - self.gamma[j] * (gradients.get(i, j) - (d_beta[j] + x_norm * d_gamma[j]) / gradients.cols as f64) / std_dev - } else { - gradients.get(i, j) - }); + let mut d_mean: f64 = 0.0; + for j in 0..self.dim { + d_mean += d_norm_row[j] * (-std_dev_inv); + } + d_mean += d_variance * (0..self.dim).map(|j| -2.0 * (input.get(i, j) - mean)).sum::() / n; + + for j in 0..self.dim { + let d_input_j = d_norm_row[j] * std_dev_inv + + d_variance * (2.0 * (input.get(i, j) - mean)) / n + + d_mean / n; + d_input.set(i, j, d_input_j); } } // Update gamma and beta for j in 0..self.dim { - self.gamma[j] -= learning_rate * d_gamma[j]; - self.beta[j] -= learning_rate * d_beta[j]; + self.gamma[j] -= learning_rate * (d_gamma[j] / input.rows as f64); + self.beta[j] -= learning_rate * (d_beta[j] / input.rows as f64); } d_input @@ -789,31 +841,48 @@ impl TransformerBlock { } } - fn backward(&mut self, gradients: &Matrix, learning_rate: f64) -> Matrix { - println!("TransformerBlock backward pass"); - // Backpropagate through feed forward layer - let ff_gradients = self.feed_forward.backward(gradients, learning_rate); + fn backward(&mut self, gradients: &Matrix, cache: &(Matrix, Matrix, Matrix, Matrix), learning_rate: f64) -> Matrix { + let (input_for_attention_bwd, input_for_norm1_bwd, input_for_ff_bwd, input_for_norm2_bwd) = cache; + + // 1. Backpropagate through the final LayerNorm (norm2) + let d_ff_plus_normed_attention = self.norm2.backward(input_for_norm2_bwd, gradients, learning_rate); + + // 2. Backpropagate through the second residual connection + let d_normed_attention_output_from_add = d_ff_plus_normed_attention.clone(); + let d_feed_forward_output = d_ff_plus_normed_attention; + + // 3. Backpropagate through the FeedForward layer + let d_normed_attention_output_from_ff = self.feed_forward.backward(input_for_ff_bwd, &d_feed_forward_output, learning_rate); + + // 4. Sum gradients for the output of the first LayerNorm + let d_normed_attention_output = d_normed_attention_output_from_add.add(&d_normed_attention_output_from_ff); + + // 5. Backpropagate through the first LayerNorm (norm1) + let d_attention_plus_input = self.norm1.backward(input_for_norm1_bwd, &d_normed_attention_output, learning_rate); - // Backpropagate through layer norm 2 - let norm2_gradients = self.norm2.backward(&ff_gradients, learning_rate); + // 6. Backpropagate through the first residual connection + let d_input_from_add = d_attention_plus_input.clone(); + let d_attention_output = d_attention_plus_input; - // Backpropagate through attention layer - let attention_gradients = self.attention.backward(&norm2_gradients, learning_rate); + // 7. Backpropagate through the MultiHeadAttention layer + let d_input_from_attention = self.attention.backward(input_for_attention_bwd, &d_attention_output, learning_rate); - // Backpropagate through layer norm 1 - let norm1_gradients = self.norm1.backward(&attention_gradients, learning_rate); + // 8. Sum gradients for the block's input + let d_input = d_input_from_add.add(&d_input_from_attention); - norm1_gradients + d_input } - fn forward(&self, input: &Matrix) -> Matrix { - println!("TransformerBlock forward pass"); - let attention_output = self.attention.forward(input, input, input); - let normed_attention_output = self.norm1.forward(&input.add(&attention_output)); + fn forward(&self, input: &Matrix, use_causal_mask: bool) -> (Matrix, Matrix, Matrix, Matrix, Matrix) { + let attention_output = self.attention.forward(input, input, input, use_causal_mask); + let attention_plus_input = input.add(&attention_output); + let normed_attention_output = self.norm1.forward(&attention_plus_input); + let feed_forward_output = self.feed_forward.forward(&normed_attention_output); - let output = self.norm2.forward(&normed_attention_output.add(&feed_forward_output)); - println!("TransformerBlock output shape: {}x{}", output.rows, output.cols); - output + let ff_plus_normed_attention = normed_attention_output.add(&feed_forward_output); + let output = self.norm2.forward(&ff_plus_normed_attention); + + (output, input.clone(), attention_plus_input, normed_attention_output, ff_plus_normed_attention) } } @@ -854,7 +923,7 @@ impl Transformer { } } - fn forward(&self, input: &[usize]) -> Matrix { + fn forward(&self, input: &[usize], use_causal_mask: bool) -> Matrix { println!("Transformer forward pass"); let mut x = self.embedding.forward(input.to_vec()); println!("Embedded input shape: {}x{}", x.rows, x.cols); @@ -863,7 +932,8 @@ impl Transformer { for (i, block) in self.blocks.iter().enumerate() { println!("Processing TransformerBlock {}", i); - x = block.forward(&x); + let (output, _, _, _, _) = block.forward(&x, use_causal_mask); + x = output; println!("After block {}: {}x{}", i, x.rows, x.cols); } @@ -873,11 +943,24 @@ impl Transformer { output } - fn train(&mut self, input: &[usize], target: &[usize], learning_rate: f64, tokenizer: &mut Tokenizer, temperature: f64) -> f64 { + fn train(&mut self, input: &[usize], target: &[usize], learning_rate: f64, tokenizer: &Tokenizer, temperature: f64, rng: &mut Rng) -> f64 { + // --- Forward pass with caching --- println!("Training on input of length {}", input.len()); - let output = self.forward(input); - let mut loss = 0.0; + let mut x = self.embedding.forward(input.to_vec()); + x = x.add(&positional_encoding(x.rows, x.cols)); + + let mut block_caches = Vec::new(); + let mut final_block_output = x; + + for block in &mut self.blocks { + let (output, cache1, cache2, cache3, cache4) = block.forward(&final_block_output, true); + block_caches.push((cache1, cache2, cache3, cache4)); + final_block_output = output; + } + let output = self.output_layer.forward(&final_block_output); + // --- Loss calculation --- + let mut loss = 0.0; let mut gradients = Matrix::new(output.rows, output.cols); for i in 0..output.rows { let target_index = if i < target.len() { target[i] } else { 0 }; @@ -890,55 +973,44 @@ impl Transformer { } } } - - println!("Calculated loss: {}", loss); - - // Generate and print prediction using generate_sequence - let input_words: Vec = input.iter().map(|&t| tokenizer.decode(t).to_string()).collect(); - let input_text = input_words.join(" "); - - let generated_sequence = self.generate_sequence(&input_text, tokenizer, temperature); - - println!("Input: '{}...{}'", - input_text.chars().take(20).collect::(), - input_text.chars().rev().take(40).collect::().chars().rev().collect::() - ); - println!("Predicted next tokens (multiple words): '{}'", generated_sequence); - println!("Actual next token: '{}'", tokenizer.decode(target[target.len() - 1])); println!("Calculated loss: {}", loss); + // --- Backward pass --- // Backpropagate through output layer println!("Backpropagating through output layer"); - let output_gradients = self.output_layer.backward(&gradients, learning_rate); + let mut block_gradients = self.output_layer.backward(&final_block_output, &gradients, learning_rate); // Backpropagate through transformer blocks println!("Backpropagating through transformer blocks"); - let mut block_gradients = output_gradients; - for (i, block) in self.blocks.iter_mut().enumerate().rev() { - println!("Backpropagating through block {}", i); - block_gradients = block.backward(&block_gradients, learning_rate); + let num_blocks = self.blocks.len(); + for (i, (block, cache)) in self.blocks.iter_mut().rev().zip(block_caches.iter().rev()).enumerate() { + println!("Backpropagating through block {}", num_blocks - 1 - i); + let (c1, c2, c3, c4) = cache; + block_gradients = block.backward(&block_gradients, &(c1.clone(), c2.clone(), c3.clone(), c4.clone()), learning_rate); } // Update embedding layer println!("Updating embedding layer"); + let embedding_gradients = block_gradients; for i in 0..input.len() { if i >= self.embedding.embeddings.rows { println!("Warning: input token {} is out of embedding range", input[i]); continue; } for j in 0..self.embedding.embedding_dim { - if j >= block_gradients.cols { - println!("Warning: j={} is out of range for block_gradients", j); + if j >= embedding_gradients.cols { + println!("Warning: j={} is out of range for embedding_gradients", j); continue; } - self.embedding.embeddings.set(input[i], j, - self.embedding.embeddings.get(input[i], j) - learning_rate * block_gradients.get(i, j)); + self.embedding.embeddings.set(input[i], j, + self.embedding.embeddings.get(input[i], j) - learning_rate * embedding_gradients.get(i, j)); } } + // Generate and print prediction let input_words: Vec = input.iter().map(|&t| tokenizer.decode(t).to_string()).collect(); let input_text = input_words.join(" "); - let prediction = self.predict_next_token(input, tokenizer, temperature); + let _prediction = self.predict_next_token(input, tokenizer, temperature, rng); println!("Input: '{}...'", input_text.chars().take(20).collect::()); println!("Actual next token: '{}'", tokenizer.decode(target[target.len() - 1])); println!("Batch loss: {}", loss); @@ -946,8 +1018,8 @@ impl Transformer { loss } - fn predict_next_token(&self, input: &[usize], tokenizer: &Tokenizer, temperature: f64) -> usize { - let output = self.forward(input); + fn predict_next_token(&self, input: &[usize], tokenizer: &Tokenizer, temperature: f64, rng: &mut Rng) -> usize { + let output = self.forward(input, true); let last_row: Vec = (0..output.cols).map(|j| output.get(output.rows - 1, j)).collect(); let mut logits = last_row; @@ -964,8 +1036,7 @@ impl Transformer { let sum: f64 = probs.iter().sum(); probs.iter_mut().for_each(|p| *p /= sum); - // Sample from the distribution using a simple random number generator - let mut rng = Rng::new(14342); + // Sample from the distribution let random_value = rng.next_f64(); let mut cumulative_prob = 0.0; for (index, &prob) in probs.iter().enumerate() { @@ -977,16 +1048,15 @@ impl Transformer { probs.len() - 1 // Fallback to the last token if sampling fails } - fn generate_sequence(&self, prompt: &str, tokenizer: &mut Tokenizer, temperature: f64) -> String { + fn generate_sequence(&self, prompt: &str, tokenizer: &Tokenizer, temperature: f64, rng: &mut Rng) -> String { let mut input_tokens = tokenizer.tokenize(prompt); let mut generated_words = Vec::with_capacity(10); println!("Generating sequence from prompt: '{}'", prompt); - let mut rng = Rng::new(24342); let unk_index = tokenizer.vocab.iter().position(|(word, _)| word == "").unwrap_or(0); for i in 0..10 { - let output = self.forward(&input_tokens); + let output = self.forward(&input_tokens, true); let last_row: Vec = (0..output.cols).map(|j| output.get(output.rows - 1, j) / temperature).collect(); let mut probs = softmax(&last_row); @@ -1029,11 +1099,12 @@ impl Transformer { fn main() { println!("Starting main function"); // Read the text file - let contents = include_str!("./Heany.txt"); + let contents = include_str!("../Heany.txt"); println!("Read file contents, length: {}", contents.len()); // Tokenize the text let mut tokenizer = Tokenizer::new(); + tokenizer.build_vocab(&contents); let tokens = tokenizer.tokenize(&contents); println!("Tokenized text, number of tokens: {}", tokens.len()); @@ -1051,11 +1122,12 @@ fn main() { let num_blocks = 3; let heads = 4; - println!("Initializing transformer with vocab_size={}, embedding_dim={}, num_blocks={}, heads={}", + println!("Initializing transformer with vocab_size={}, embedding_dim={}, num_blocks={}, heads={}", vocab_size, embedding_dim, num_blocks, heads); let mut transformer = Transformer::new(vocab_size, embedding_dim, num_blocks, heads); let temperature = 0.8; + let mut rng = Rng::new(42); // Create a single RNG instance println!("Starting training loop: seq_length={}, epochs={}, learning_rate={}", seq_length, epochs, learning_rate); let batch_size = 32; @@ -1071,13 +1143,13 @@ fn main() { } let input = &tokens[i+j..i+j+seq_length]; let target = &tokens[i+j+1..i+j+seq_length+1]; - - batch_loss += transformer.train(input, target, learning_rate, &mut tokenizer, temperature); + + batch_loss += transformer.train(input, target, learning_rate, &tokenizer, temperature, &mut rng); } total_loss += batch_loss; batch_count += 1; - + current_iteration += batch_size; let progress = (current_iteration as f64 / total_iterations as f64) * 100.0; println!("Progress: {:.2}% ({}/{})", progress, current_iteration, total_iterations); @@ -1093,8 +1165,8 @@ fn main() { println!("Generating predictions for prompt: '{}'", prompt); let temperature = 0.8; - let generated_sequence = transformer.generate_sequence(prompt, &mut tokenizer, temperature); - + let generated_sequence = transformer.generate_sequence(prompt, &tokenizer, temperature, &mut rng); + println!("Generated sequence: {}", generated_sequence); println!("Prediction generation completed"); } diff --git a/target/.rustc_info.json b/target/.rustc_info.json new file mode 100644 index 0000000..44ddf75 --- /dev/null +++ b/target/.rustc_info.json @@ -0,0 +1 @@ +{"rustc_fingerprint":8083588708585034561,"outputs":{"17747080675513052775":{"success":true,"status":"","code":0,"stdout":"rustc 1.88.0 (6b00bc388 2025-06-23)\nbinary: rustc\ncommit-hash: 6b00bc3880198600130e1cf62b8f8a93494488cc\ncommit-date: 2025-06-23\nhost: x86_64-unknown-linux-gnu\nrelease: 1.88.0\nLLVM version: 20.1.5\n","stderr":""},"7971740275564407648":{"success":true,"status":"","code":0,"stdout":"___\nlib___.rlib\nlib___.so\nlib___.so\nlib___.a\nlib___.so\n/home/jules/.rustup/toolchains/stable-x86_64-unknown-linux-gnu\noff\npacked\nunpacked\n___\ndebug_assertions\npanic=\"unwind\"\nproc_macro\ntarget_abi=\"\"\ntarget_arch=\"x86_64\"\ntarget_endian=\"little\"\ntarget_env=\"gnu\"\ntarget_family=\"unix\"\ntarget_feature=\"fxsr\"\ntarget_feature=\"sse\"\ntarget_feature=\"sse2\"\ntarget_has_atomic=\"16\"\ntarget_has_atomic=\"32\"\ntarget_has_atomic=\"64\"\ntarget_has_atomic=\"8\"\ntarget_has_atomic=\"ptr\"\ntarget_os=\"linux\"\ntarget_pointer_width=\"64\"\ntarget_vendor=\"unknown\"\nunix\n","stderr":""}},"successes":{}} \ No newline at end of file diff --git a/target/CACHEDIR.TAG b/target/CACHEDIR.TAG new file mode 100644 index 0000000..20d7c31 --- /dev/null +++ b/target/CACHEDIR.TAG @@ -0,0 +1,3 @@ +Signature: 8a477f597d28d172789f06886806bc55 +# This file is a cache directory tag created by cargo. +# For information about cache directory tags see https://bford.info/cachedir/ diff --git a/target/release/.cargo-lock b/target/release/.cargo-lock new file mode 100644 index 0000000..e69de29 diff --git a/target/release/.fingerprint/rustformer-b0ff45543694d30a/bin-rustformer b/target/release/.fingerprint/rustformer-b0ff45543694d30a/bin-rustformer new file mode 100644 index 0000000..bd0640b --- /dev/null +++ b/target/release/.fingerprint/rustformer-b0ff45543694d30a/bin-rustformer @@ -0,0 +1 @@ +1cc7cc2cecc64d73 \ No newline at end of file diff --git a/target/release/.fingerprint/rustformer-b0ff45543694d30a/bin-rustformer.json b/target/release/.fingerprint/rustformer-b0ff45543694d30a/bin-rustformer.json new file mode 100644 index 0000000..74c975f --- /dev/null +++ b/target/release/.fingerprint/rustformer-b0ff45543694d30a/bin-rustformer.json @@ -0,0 +1 @@ +{"rustc":11410426090777951712,"features":"[]","declared_features":"[]","target":12680810711990473699,"profile":2040997289075261528,"path":4942398508502643691,"deps":[],"local":[{"CheckDepInfo":{"dep_info":"release/.fingerprint/rustformer-b0ff45543694d30a/dep-bin-rustformer","checksum":false}}],"rustflags":[],"config":2069994364910194474,"compile_kind":0} \ No newline at end of file diff --git a/target/release/.fingerprint/rustformer-b0ff45543694d30a/dep-bin-rustformer b/target/release/.fingerprint/rustformer-b0ff45543694d30a/dep-bin-rustformer new file mode 100644 index 0000000..cfafaeb Binary files /dev/null and b/target/release/.fingerprint/rustformer-b0ff45543694d30a/dep-bin-rustformer differ diff --git a/target/release/.fingerprint/rustformer-b0ff45543694d30a/invoked.timestamp b/target/release/.fingerprint/rustformer-b0ff45543694d30a/invoked.timestamp new file mode 100644 index 0000000..e00328d --- /dev/null +++ b/target/release/.fingerprint/rustformer-b0ff45543694d30a/invoked.timestamp @@ -0,0 +1 @@ +This file has an mtime of when this was started. \ No newline at end of file diff --git a/target/release/.fingerprint/rustformer-b0ff45543694d30a/output-bin-rustformer b/target/release/.fingerprint/rustformer-b0ff45543694d30a/output-bin-rustformer new file mode 100644 index 0000000..582da5e --- /dev/null +++ b/target/release/.fingerprint/rustformer-b0ff45543694d30a/output-bin-rustformer @@ -0,0 +1,2 @@ +{"$message_type":"diagnostic","message":"function `ln` is never used","code":{"code":"dead_code","explanation":null},"level":"warning","spans":[{"file_name":"src/main.rs","byte_start":4362,"byte_end":4364,"line_start":154,"line_end":154,"column_start":4,"column_end":6,"is_primary":true,"text":[{"text":"fn ln(x: f64) -> f64 {","highlight_start":4,"highlight_end":6}],"label":null,"suggested_replacement":null,"suggestion_applicability":null,"expansion":null}],"children":[{"message":"`#[warn(dead_code)]` on by default","code":null,"level":"note","spans":[],"children":[],"rendered":null}],"rendered":"\u001b[0m\u001b[1m\u001b[33mwarning\u001b[0m\u001b[0m\u001b[1m: function `ln` is never used\u001b[0m\n\u001b[0m \u001b[0m\u001b[0m\u001b[1m\u001b[38;5;12m--> \u001b[0m\u001b[0msrc/main.rs:154:4\u001b[0m\n\u001b[0m \u001b[0m\u001b[0m\u001b[1m\u001b[38;5;12m|\u001b[0m\n\u001b[0m\u001b[1m\u001b[38;5;12m154\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[1m\u001b[38;5;12m|\u001b[0m\u001b[0m \u001b[0m\u001b[0mfn ln(x: f64) -> f64 {\u001b[0m\n\u001b[0m \u001b[0m\u001b[0m\u001b[1m\u001b[38;5;12m|\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[1m\u001b[33m^^\u001b[0m\n\u001b[0m \u001b[0m\u001b[0m\u001b[1m\u001b[38;5;12m|\u001b[0m\n\u001b[0m \u001b[0m\u001b[0m\u001b[1m\u001b[38;5;12m= \u001b[0m\u001b[0m\u001b[1mnote\u001b[0m\u001b[0m: `#[warn(dead_code)]` on by default\u001b[0m\n\n"} +{"$message_type":"diagnostic","message":"1 warning emitted","code":null,"level":"warning","spans":[],"children":[],"rendered":"\u001b[0m\u001b[1m\u001b[33mwarning\u001b[0m\u001b[0m\u001b[1m: 1 warning emitted\u001b[0m\n\n"} diff --git a/target/release/deps/rustformer-b0ff45543694d30a b/target/release/deps/rustformer-b0ff45543694d30a new file mode 100755 index 0000000..98c0a5a Binary files /dev/null and b/target/release/deps/rustformer-b0ff45543694d30a differ diff --git a/target/release/deps/rustformer-b0ff45543694d30a.d b/target/release/deps/rustformer-b0ff45543694d30a.d new file mode 100644 index 0000000..ce2373e --- /dev/null +++ b/target/release/deps/rustformer-b0ff45543694d30a.d @@ -0,0 +1,6 @@ +/app/target/release/deps/rustformer-b0ff45543694d30a.d: src/main.rs src/../Heany.txt + +/app/target/release/deps/rustformer-b0ff45543694d30a: src/main.rs src/../Heany.txt + +src/main.rs: +src/../Heany.txt: diff --git a/target/release/rustformer b/target/release/rustformer new file mode 100755 index 0000000..98c0a5a Binary files /dev/null and b/target/release/rustformer differ diff --git a/target/release/rustformer.d b/target/release/rustformer.d new file mode 100644 index 0000000..45850a8 --- /dev/null +++ b/target/release/rustformer.d @@ -0,0 +1 @@ +/app/target/release/rustformer: /app/Heany.txt /app/src/main.rs