Objectives:
4 exercises follow:
pip install transformers[torch]
from transformers import AutoTokenizer, AutoModel, pipeline
model = AutoModel.from_pretrained('distilbert-base-uncased')         # DistilBERT encoder
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
nlp = pipeline('feature-extraction', model=model, tokenizer=tokenizer)  # returns one contextual vector per token
s = 'Do you like cakes ?'
features = nlp(s)
print([features[0][i][:2] for i in range(len(features[0]))])  # first 2 dimensions of each token vector
inputs = tokenizer.encode_plus(s, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)  # tokens aligned with the vectors in features[0]
print(text_tokens)
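To see which vector goes with which token, the pipeline output and the token list can be zipped together; a minimal sketch, relying on the fact that features[0][i] is the contextual vector of text_tokens[i] (including [CLS] and [SEP]):

for tok, vec in zip(text_tokens, features[0]):
    print(tok, vec[:2])  # token and the first 2 dimensions of its 768-dim vector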
pip install scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(v1, v2))  # v1, v2: two 2-D arrays of shape (1, dim)
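v1 and v2 are not defined above; a minimal sketch, assuming they are two of the token vectors from features (cosine_similarity expects 2-D inputs, hence the extra list around each vector):

v1 = [features[0][1]]  # vector of 'do' (index 0 is [CLS])
v2 = [features[0][2]]  # vector of 'you'
print(cosine_similarity(v1, v2))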
Answer the question: do BERT embeddings encode POS information?
Methodology: train a simple probe (a logistic regression) on BERT token embeddings to predict each token's universal POS tag; if the probe reaches high accuracy, the embeddings encode POS information.
import nltk
nltk.download('brown')             # the Brown corpus
nltk.download('universal_tagset')  # mapping from Brown tags to the universal tagset
nltk.corpus.brown.sents()                           # tokenized sentences
nltk.corpus.brown.tagged_words(tagset='universal')  # (word, universal POS tag) pairs
Universal tagset:
VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation
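The probe needs an embedding matrix X and a label vector y. A minimal sketch of building them (the alignment here is deliberately naive: each word is embedded in isolation through the feature-extraction pipeline and its sub-word vectors are averaged; the 500-word subset is an arbitrary choice to keep it fast):

import numpy as np

tagged = nltk.corpus.brown.tagged_words(tagset='universal')[:500]  # small subset for speed
X, y = [], []
for word, tag in tagged:
    vecs = nlp(word)[0]              # sub-word vectors of the word, incl. [CLS] and [SEP]
    X.append(np.mean(vecs, axis=0))  # average them into one vector per word
    y.append(tag)
X = np.array(X)
y = np.array(y)

Embedding each word inside its Brown sentence and pooling the matching sub-word span would be more faithful to how BERT is used, at the cost of more alignment code.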
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X, y)  # X: token embeddings, y: universal POS tags
clf.predict(X[:2, :])        # predicted tags for the first two tokens
clf.predict_proba(X[:2, :])  # class probabilities for the first two tokens
clf.score(X, y)              # accuracy, here measured on the training data itself
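clf.score(X, y) is measured on the same data the probe was fit on, so it overstates how well POS is encoded; a held-out split gives a more honest estimate. A minimal sketch (the split ratio and max_iter value are choices made here, not part of the original):

from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
probe = LogisticRegression(random_state=0, max_iter=1000).fit(X_tr, y_tr)
print(probe.score(X_te, y_te))  # accuracy on tokens the probe has never seen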
Another reference on this subject: https://pageperso.lis-lab.fr/benoit.favre/pstaln/09_embedding_evaluation.html
(see the GitHub blog)
import torch
import pytorch_lightning as pl

class Mod(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.W = torch.nn.Linear(1, 5)  # embeds a scalar input into a 5-dim space

    def configure_optimizers(self):
        opt = torch.optim.AdamW(self.parameters(), lr=1e-3)
        return opt

    def training_step(self, batch, batch_idx):
        anc, pos, neg = batch  # anchor, positive and negative samples
        ea = self.W(anc)
        ep = self.W(pos)
        en = self.W(neg)
        dp = torch.nn.functional.triplet_margin_loss(ea, ep, en)
        self.log("train_loss", dp, on_step=False, on_epoch=True)
        return dp
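The logged train_loss is PyTorch's triplet margin loss with its default settings (Euclidean distance, margin 1.0): max(||ea - ep||_2 - ||ea - en||_2 + 1.0, 0). Minimizing it pulls the anchor embedding towards its positive and pushes it away from its negative by at least the margin.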
class TripDS(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()

    def __len__(self):
        return 1000

    def __getitem__(self, i):
        if i % 2 == 0:
            # even index: sample the anchor and positive from class 1, the negative from class 2
            xa = torch.randn(1)/10. - 0.5
            xp = torch.randn(1)/10. - 0.5
            xn = torch.randn(1)/10. + 0.5
            return xa, xp, xn
        else:
            # odd index: sample the anchor and positive from class 2, the negative from class 1
            xa = torch.randn(1)/10. + 0.5
            xp = torch.randn(1)/10. + 0.5
            xn = torch.randn(1)/10. - 0.5
            return xa, xp, xn
traindata = TripDS()
trainloader = torch.utils.data.DataLoader(traindata, batch_size=1, shuffle=False)  # one triplet per step
mod = Mod()
logger = pl.loggers.TensorBoardLogger(save_dir="logs/", flush_secs=1)
trainer = pl.Trainer(limit_train_batches=1.0, max_epochs=1000, log_every_n_steps=1, logger=logger)
trainer.fit(model=mod, train_dataloaders=trainloader)
tensorboard --logdir=logs/
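A quick sanity check after training (a sketch, not part of the original lab; the sample count is arbitrary): embed fresh points from each class and verify that the two clusters end up clearly separated in the 5-dimensional embedding space.

with torch.no_grad():
    e1 = mod.W(torch.randn(100, 1)/10. - 0.5)  # embeddings of new class-1 points
    e2 = mod.W(torch.randn(100, 1)/10. + 0.5)  # embeddings of new class-2 points
print(torch.dist(e1.mean(0), e2.mean(0)))  # distance between class centroids, should be well above the within-class spread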