import torch
from torch import nn
from transformers import BertModel, GPT2LMHeadModel, BertTokenizer, GPT2Tokenizer
# Checkpoint identifiers; each model shares its identifier with its tokenizer
_BERT_CHECKPOINT = 'bert-base-uncased'
_GPT2_CHECKPOINT = 'gpt2'

# BERT: tokenizer and encoder weights
bert_tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)

# GPT-2: tokenizer and language-model head
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(_GPT2_CHECKPOINT)
gpt2_model = GPT2LMHeadModel.from_pretrained(_GPT2_CHECKPOINT)

# Reuse the end-of-sequence token as the padding token so that the
# padding=True tokenizer calls made later in this file have one to use
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
class BertGpt2Autoencoder(nn.Module):
    """Autoencoder that encodes text with BERT and reconstructs it with GPT-2.

    The first-position ([CLS]) hidden state of the encoder is projected by a
    linear layer to the decoder's embedding width, repeated once per label
    position, and passed to the decoder as ``inputs_embeds`` together with
    the labels.
    """

    def __init__(self, bert_model, gpt2_model):
        """Store both sub-models, build the projection and pick a device.

        Args:
            bert_model: Encoder; must expose ``config.hidden_size``.
            gpt2_model: Decoder; must expose ``config.n_embd``.
        """
        super().__init__()
        self.bert = bert_model
        self.gpt2 = gpt2_model

        # Linear bridge from the encoder width to the decoder embedding width
        encoder_width = bert_model.config.hidden_size
        decoder_width = gpt2_model.config.n_embd
        self.latent_to_gpt2 = nn.Linear(encoder_width, decoder_width)

        # Prefer CUDA when available; callers read this attribute to place
        # their input tensors on the same device as the model
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.to(self.device)

    def forward(self, input_ids, attention_mask, labels):
        """Encode the input and return the decoder's output for ``labels``.

        Args:
            input_ids: Encoder token ids.
            attention_mask: Encoder attention mask.
            labels: Decoder target ids; their second dimension sets how many
                times the latent vector is repeated.

        Returns:
            Whatever the decoder returns when called with ``inputs_embeds``
            and ``labels``.
        """
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        cls_state = hidden_states[:, 0]  # first position = [CLS] token

        # Project to the decoder width, then repeat along a new sequence axis
        projected = self.latent_to_gpt2(cls_state)
        target_length = labels.shape[1]
        decoder_embeds = projected[:, None, :].expand(-1, target_length, -1)

        return self.gpt2(inputs_embeds=decoder_embeds, labels=labels)
def prepare_data(text, bert_tokenizer, gpt2_tokenizer, max_length=64):
    """Tokenize the same text for the encoder and for the decoder.

    Both tokenizers are called with identical options (PyTorch tensors,
    ``padding=True``, ``truncation=True`` and the given ``max_length``).

    Args:
        text: Text passed unchanged to both tokenizers.
        bert_tokenizer: Callable tokenizer producing the encoder inputs.
        gpt2_tokenizer: Callable tokenizer producing the decoder inputs.
        max_length: Truncation length passed to both tokenizers. Defaults
            to 64, the value that used to be hard-coded.

    Returns:
        A ``(bert_inputs, gpt2_inputs)`` tuple holding each tokenizer's
        output.
    """
    # One shared option set keeps the two tokenizer calls in sync
    tokenizer_kwargs = {
        'return_tensors': 'pt',
        'padding': True,
        'truncation': True,
        'max_length': max_length,
    }
    bert_inputs = bert_tokenizer(text, **tokenizer_kwargs)
    gpt2_inputs = gpt2_tokenizer(text, **tokenizer_kwargs)
    return bert_inputs, gpt2_inputs
# Initialize the autoencoder
autoencoder = BertGpt2Autoencoder(bert_model, gpt2_model)
# The constructor already moves the model; this repeat call is a harmless no-op
autoencoder = autoencoder.to(autoencoder.device)
# Define optimizer
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-5)
# Sample training data
texts = ["Payment for groceries", "Rent for April", "Monthly subscription fee", "Electricity bill payment"]
# from_pretrained() returns models in evaluation mode (dropout disabled), so
# training mode has to be switched on explicitly before optimizing
autoencoder.train()
# Training loop: one optimizer step per sample
for epoch in range(200):  # Number of epochs
    epoch_loss = 0.0
    for text in texts:
        # Prepare data
        bert_inputs, gpt2_inputs = prepare_data(text, bert_tokenizer, gpt2_tokenizer)
        input_ids = bert_inputs['input_ids'].to(autoencoder.device)
        attention_mask = bert_inputs['attention_mask'].to(autoencoder.device)
        labels = gpt2_inputs['input_ids'].to(autoencoder.device)
        # Forward pass
        outputs = autoencoder(input_ids, attention_mask, labels)
        loss = outputs.loss  # Use the loss calculated by GPT-2
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Report the mean over the epoch rather than only the last sample's loss
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(texts)}")
# Switch back to evaluation mode so later scoring runs without dropout
autoencoder.eval()
def detect_anomaly(text, autoencoder, bert_tokenizer, gpt2_tokenizer, threshold=5.0):
    """Flag text whose reconstruction loss exceeds a threshold.

    The text is tokenized with ``prepare_data``, run through the autoencoder
    without gradient tracking, and the resulting loss is compared with
    ``threshold``. The verdict is printed and also returned.

    Args:
        text: Text to score, passed unchanged to both tokenizers.
        autoencoder: Model exposing ``device`` and ``eval()`` and returning
            an object with a ``loss`` attribute when called.
        bert_tokenizer: Tokenizer for the encoder inputs.
        gpt2_tokenizer: Tokenizer for the decoder labels.
        threshold: Loss above which the text is reported as an anomaly.

    Returns:
        True when the loss is strictly greater than ``threshold``, otherwise
        False.

    Note:
        The autoencoder is left in evaluation mode after the call.
    """
    bert_inputs, gpt2_inputs = prepare_data(text, bert_tokenizer, gpt2_tokenizer)
    device = autoencoder.device
    input_ids = bert_inputs['input_ids'].to(device)
    attention_mask = bert_inputs['attention_mask'].to(device)

    # Padded label positions must not contribute to the loss. The pad token
    # is the EOS token in this file, so padding is identified through the
    # attention mask instead of the token id; -100 is the label value the
    # Hugging Face LM loss ignores.
    padding_positions = gpt2_inputs['attention_mask'] == 0
    labels = gpt2_inputs['input_ids'].masked_fill(padding_positions, -100).to(device)

    # Score deterministically: no dropout, no gradient bookkeeping
    autoencoder.eval()
    with torch.no_grad():
        outputs = autoencoder(input_ids, attention_mask, labels)
    loss = outputs.loss.item()  # Get the reconstruction loss

    # Anomaly detection based on threshold
    is_anomaly = loss > threshold
    if is_anomaly:
        print(f"Anomaly detected! Loss: {loss}")
    else:
        print(f"Text is normal. Loss: {loss}")
    return is_anomaly
# Example usage: score one sentence with the default threshold
detect_anomaly(
    text="Urgent transfer to offshore account",
    autoencoder=autoencoder,
    bert_tokenizer=bert_tokenizer,
    gpt2_tokenizer=gpt2_tokenizer,
)