\[\left( \frac{S}{S_{min}} - 1 \right) \left( \frac{E}{E_{min}} - 1 \right) = 1\]
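Assuming, as in the large-batch training literature, that \(S\) is the number of optimization steps and \(E\) the number of training examples processed, the relation can be rearranged to read off the data cost of training in fewer steps:

\[E = E_{min}\left(1 + \frac{1}{S/S_{min} - 1}\right), \qquad S > S_{min}\]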
\[L = \begin{cases} d(f_{\theta}(x_a),f_{\theta}(x_p)) & \text{if } (x_a,x_p) \text{ is a positive pair}\\ \max\left(0,\; m-d(f_{\theta}(x_a),f_{\theta}(x_n))\right) & \text{if } (x_a,x_n) \text{ is a negative pair} \end{cases}\]
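A minimal PyTorch sketch of this pairwise contrastive loss (the function name and the choice of Euclidean distance for d are illustrative assumptions, not taken from the original):

import torch

def contrastive_pair_loss(f_x1, f_x2, is_positive, margin=1.0):
    # Euclidean distance d between the two embeddings
    d = torch.nn.functional.pairwise_distance(f_x1, f_x2)
    # positive pairs are pulled together, negative pairs pushed beyond the margin m
    return torch.where(is_positive, d, torch.clamp(margin - d, min=0.0)).mean()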
\[L = \max\left(0,m+d(f_{\theta}(x_a),f_{\theta}(x_p))-d(f_{\theta}(x_a),f_{\theta}(x_n))\right)\]
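This is what torch.nn.functional.triplet_margin_loss computes (with d the Euclidean distance and margin m), which is the function used in the training code below; a quick sketch checking the formula against the built-in on random embeddings (variable names are illustrative):

import torch
import torch.nn.functional as F
ea, ep, en = torch.randn(8, 5), torch.randn(8, 5), torch.randn(8, 5)
manual = F.relu(F.pairwise_distance(ea, ep) - F.pairwise_distance(ea, en) + 1.0).mean()
print(torch.allclose(manual, F.triplet_margin_loss(ea, ep, en, margin=1.0)))  # expected: True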
(see github blog)
import torch
import pytorch_lightning as pl

class Mod(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # a single linear layer mapping a scalar input to a 5-dimensional embedding
        self.W = torch.nn.Linear(1, 5)

    def configure_optimizers(self):
        opt = torch.optim.AdamW(self.parameters(), lr=1e-3)
        return opt

    def training_step(self, batch, batch_idx):
        anc, pos, neg = batch
        # embed anchor, positive and negative samples with the same network
        ea = self.W(anc)
        ep = self.W(pos)
        en = self.W(neg)
        # triplet loss: pull the positive towards the anchor, push the negative beyond the margin
        loss = torch.nn.functional.triplet_margin_loss(ea, ep, en)
        self.log("train_loss", loss, on_step=False, on_epoch=True)
        return loss
class TripDS(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()

    def __len__(self):
        return 1000

    def __getitem__(self, i):
        if i % 2 == 0:
            # even index: sample the anchor from class 1 (centred at -0.5)
            xa = torch.randn(1) / 10. - 0.5
            xp = torch.randn(1) / 10. - 0.5
            xn = torch.randn(1) / 10. + 0.5
            return xa, xp, xn
        else:
            # odd index: sample the anchor from class 2 (centred at +0.5)
            xa = torch.randn(1) / 10. + 0.5
            xp = torch.randn(1) / 10. + 0.5
            xn = torch.randn(1) / 10. - 0.5
            return xa, xp, xn
traindata = TripDS()
trainloader = torch.utils.data.DataLoader(traindata, batch_size=1, shuffle=False)
mod = Mod()
logger = pl.loggers.TensorBoardLogger(save_dir="logs/", flush_secs=1)
trainer = pl.Trainer(limit_train_batches=1.0, max_epochs=1000, log_every_n_steps=1, logger=logger)
trainer.fit(model=mod, train_dataloaders=trainloader)
tensorboard --logdir=logs/
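After training, the learned embedding can be inspected directly; a small illustrative check (not part of the original) that the two class centres from the toy dataset end up separated in the 5-d embedding space:

mod.eval()
with torch.no_grad():
    e1 = mod.W(torch.tensor([[-0.5]]))  # class-1 centre
    e2 = mod.W(torch.tensor([[0.5]]))   # class-2 centre
    # after training, the distance should exceed the triplet margin (1.0 by default)
    print(torch.nn.functional.pairwise_distance(e1, e2))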
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = Image.open("difftrain.png")
inputs = processor(text=["a rabbit","a curve","a chair"], images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
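To read off the zero-shot prediction, the highest-probability index can be mapped back to the corresponding text prompt (an illustrative addition, not in the original):

labels = ["a rabbit", "a curve", "a chair"]
pred = probs.argmax(dim=1).item()
print(labels[pred], probs[0, pred].item())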
\[L\leq E_{x_0\sim p_{data},x_{1:T}\sim q(x_{1:T}|x_0)}\left[-\log p(x_T)-\sum_{t\geq 1} \log \frac {p_\theta(x_{t-1}|x_t)}{q(x_t|x_{t-1})}\right]\]
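In practice this bound is optimised through the simplified noise-prediction objective of Ho et al.: a network is trained to recover the Gaussian noise added to \(x_0\) at a random timestep \(t\). A minimal sketch of one such training step, assuming a noise-prediction network eps_model, a precomputed cumulative schedule alphas_cumprod of length T, and an image batch x0 (all illustrative names, not part of the Stable Diffusion code below):

import torch
t = torch.randint(0, T, (x0.shape[0],), device=x0.device)    # one random timestep per sample
eps = torch.randn_like(x0)                                    # noise to be predicted
a_bar = alphas_cumprod[t].view(-1, 1, 1, 1)
x_t = a_bar.sqrt() * x0 + (1 - a_bar).sqrt() * eps            # forward process q(x_t | x_0)
loss = torch.nn.functional.mse_loss(eps_model(x_t, t), eps)   # simplified DDPM loss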
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=True)
prompt = "a photo of cat playing with a rat"
image = pipe(prompt, guidance_scale=7.5).images[0]
image.save("cat.png")