christophe.cerisara@loria.fr
What do you propose?
What are the components of a RAG system?
What is the difference between the embedding model and the LLM? Why not a single LLM?
If both embedders and LLM are transformers, what is their difference? (Think training)
import ollama
from numpy.linalg import norm
import numpy as np
# first download data: wget https://olki.loria.fr/cerisara/lexres/frnews.txt
# embedding model:
em="nextfire/paraphrase-multilingual-minilm"
def find_most_similar(needle, haystack):
    # cosine similarity between the query embedding and every stored embedding
    needle_norm = norm(needle)
    similarity_scores = [
        np.dot(needle, item) / (needle_norm * norm(item)) for item in haystack
    ]
    print("debug", similarity_scores)
    # return (score, index) pairs sorted by decreasing similarity
    return sorted(zip(similarity_scores, range(len(haystack))), reverse=True)
SYSTEM_PROMPT = """You are a helpful reading assistant who answers questions
based on snippets of text provided in context. Answer only using the context provided,
being as concise as possible. If you're unsure, just say that you don't know.
Context:
"""
with open("frnews.txt","r") as f: lines = f.readlines()
bdd = []
for i, l in enumerate(lines):
    if i >= 50: break
    # see https://sbert.net/examples/applications/computing-embeddings/README.html
    embeddings = ollama.embeddings(model=em, prompt=l)["embedding"]
    bdd.append(embeddings)
print("bdd built")
q="Dans quelle ville y a-t-il eu des canicules ?\n"
prompt_embedding = ollama.embeddings(model=em, prompt=q)["embedding"]
most_similar_chunks = find_most_similar(prompt_embedding, bdd)[:1]
print("retrieved:",most_similar_chunks,lines[most_similar_chunks[0][1]])
response = ollama.chat(
    model="qwen2.5",
    messages=[
        {
            "role": "system",
            "content": SYSTEM_PROMPT
            + "\n".join([lines[x[1]] for x in most_similar_chunks]),
        },
        {"role": "user", "content": q},
    ],
)
print("\n\n")
print(response["message"]["content"])
# see https://decoder.sh/videos/rag-from-the-ground-up-with-python-and-ollama
BERT is not good enough: similar words are not close enough in its embedding space.
def: embedding space = the vector (\(\in R^d\)) that BERT outputs for each word of a sentence. If you pass every English sentence through BERT, you obtain the full embedding space.
Let’s verify!
Hints below…
pip install transformers[torch]
from transformers import AutoTokenizer, AutoModel, pipeline
model = AutoModel.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
nlp = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
s = 'Do you like cakes ?'
features = nlp(s)
print([features[0][i][:2] for i in range(len(features[0]))])
inputs = tokenizer.encode_plus(s, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(text_tokens)
from sklearn.metrics.pairwise import cosine_similarity
# v1, v2: two token embedding vectors to compare, each wrapped as a 2-D array (e.g. [vector])
print(cosine_similarity(v1, v2))
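Putting these hints together, here is a minimal sketch that compares the contextual embeddings of a related vs. a less related word; the sentences and word choices are only illustrative:

from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')
nlp = pipeline('feature-extraction', model=model, tokenizer=tokenizer)

def word_vector(sentence, word):
    # contextual embedding of the first occurrence of `word` (assumed to be a single wordpiece)
    toks = tokenizer.tokenize(sentence)
    feats = nlp(sentence)[0]              # one vector per token, [CLS] first
    return [feats[toks.index(word) + 1]]  # +1 to skip [CLS]

v1 = word_vector('I love my dog .', 'dog')
v2 = word_vector('I love my cat .', 'cat')   # a very similar word, same context
v3 = word_vector('I love my car .', 'car')   # a less related word, for comparison
print(cosine_similarity(v1, v2), cosine_similarity(v1, v3))

If BERT's space were well calibrated for similarity, the first score should be clearly higher than the second; in practice the gap is often small, which motivates contrastively trained embedders.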
from sklearn import manifold
# m: precomputed matrix of pairwise distances between the embeddings to visualize
tsne = manifold.TSNE(n_components=2, metric="precomputed", init="random", perplexity=2)
coords = tsne.fit_transform(m)
import matplotlib.pyplot as plt
plt.scatter(coords[:, 0], coords[:, 1], marker='o')
plt.show()
\[L=\begin{cases} d(s_A,s_B) & \text{if Positive Pair}\\ \max(0,\,m-d(s_A,s_B)) & \text{if Negative Pair} \end{cases}\]
\[L= - \frac 1 N \sum_{i=1}^N \log \frac{e^{sim(s_{A_i},s_0)}}{\frac 1 M \sum_{j=0}^{M} e^{sim(s_{A_i},s_j)}}\]
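A minimal PyTorch sketch of this softmax-style contrastive objective in its common in-batch form (each anchor's positive is the matching row of `positives`; the \(1/M\) factor inside the log only adds a constant, so it is dropped):

import torch
import torch.nn.functional as F

def softmax_contrastive_loss(anchors, positives, temperature=1.0):
    # anchors, positives: (N, d) embeddings; positives[i] is the positive pair of anchors[i]
    a = F.normalize(anchors, dim=-1)
    p = F.normalize(positives, dim=-1)
    sim = a @ p.T / temperature          # (N, N) matrix of cosine similarities
    labels = torch.arange(a.size(0))     # the correct "class" of anchor i is column i
    return F.cross_entropy(sim, labels)  # = -1/N sum_i log softmax(sim_i)[i]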
Once an embedding space is trained, how can you use it to directly perform instance-based classification?
from datasets import load_dataset
ds = load_dataset("fancyzhx/ag_news", split='test')
(see github blog)
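One possible answer, as a minimal sketch: embed a small set of labelled sentences and classify each new sentence with the label of its nearest neighbour in the embedding space (the model name and sample sizes below are illustrative choices):

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np

ds = load_dataset("fancyzhx/ag_news", split="test")
model = SentenceTransformer("all-MiniLM-L6-v2")

support = ds.select(range(100))        # labelled "memory" instances
queries = ds.select(range(100, 120))   # sentences to classify

sup_emb = model.encode(support["text"], normalize_embeddings=True)
qry_emb = model.encode(queries["text"], normalize_embeddings=True)

sims = qry_emb @ sup_emb.T             # cosine similarities (embeddings are normalized)
pred = [support["label"][i] for i in sims.argmax(axis=1)]
acc = np.mean(np.array(pred) == np.array(queries["label"]))
print("1-NN accuracy on this tiny sample:", acc)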
import torch
import random
import pytorch_lightning as pl

class Mod(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # toy 1D -> 5D linear "embedder"
        self.W = torch.nn.Linear(1, 5)

    def configure_optimizers(self):
        opt = torch.optim.AdamW(self.parameters(), lr=1e-3)
        return opt

    def training_step(self, batch, batch_idx):
        anc, pos, neg = batch
        ea = self.W(anc)
        ep = self.W(pos)
        en = self.W(neg)
        dp = torch.nn.functional.triplet_margin_loss(ea, ep, en)
        self.log("train_loss", dp, on_step=False, on_epoch=True)
        return dp
class TripDS(torch.utils.data.Dataset):
    def __init__(self):
        super().__init__()

    def __len__(self):
        return 1000

    def __getitem__(self, i):
        if i % 2 == 0:
            # even index: sample the anchor from class 1 (two modes, at -0.5 and +1.5)
            z = random.randint(0, 1)
            if z == 0: xa = torch.randn(1)/10. - 0.5
            else: xa = torch.randn(1)/10. + 1.5
            z = random.randint(0, 1)
            if z == 0: xp = torch.randn(1)/10. - 0.5
            else: xp = torch.randn(1)/10. + 1.5
            xn = torch.randn(1)/10. + 0.5
        else:
            # odd index: sample the anchor from class 2 (a single mode at +0.5)
            xa = torch.randn(1)/10. + 0.5
            xp = torch.randn(1)/10. + 0.5
            z = random.randint(0, 1)
            if z == 0: xn = torch.randn(1)/10. - 0.5
            else: xn = torch.randn(1)/10. + 1.5
        return xa, xp, xn
traindata = TripDS()
trainloader = torch.utils.data.DataLoader(traindata, batch_size=1, shuffle=False)
mod = Mod()
logger = pl.loggers.TensorBoardLogger(save_dir="logs/", flush_secs=1)
trainer = pl.Trainer(limit_train_batches=1.0, max_epochs=1000, log_every_n_steps=1,logger=logger)
trainer.fit(model=mod, train_dataloaders=trainloader)
tensorboard --logdir=logs/
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = Image.open("difftrain.png")
inputs = processor(text=["a rabbit","a curve","a chair"], images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
If the LLM is good enough, is there no need to finetune?
Is it possible to get a good enough LLM?
Method | Bits | 7B | 13B | 30B | 70B | 110B | 8x7B | 8x22B |
---|---|---|---|---|---|---|---|---|
Full | 32 | 120GB | 240GB | 600GB | 1200GB | 2000GB | 900GB | 2400GB |
Full | 16 | 60GB | 120GB | 300GB | 600GB | 900GB | 400GB | 1200GB |
LoRA/GaLore/BAdam | 16 | 16GB | 32GB | 64GB | 160GB | 240GB | 120GB | 320GB |
QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | 140GB | 60GB | 160GB |
QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | 72GB | 30GB | 96GB |
QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | 48GB | 18GB | 48GB |
Method | data | notes |
---|---|---|
Pretraining | >10T | Full training |
Cont. pretr. | \(\simeq 100\)b | update: PEFT? |
Finetuning | 1k … 1b | Adapt to task: PEFT |
Few-Shot learning | < 1k | Guide, help the LLM |
\[X' = \mathrm{ReLU}(X\cdot W_{down}) \cdot W_{up} + X\]
with
\[W_{down} \in R^{d\times k}~~~~W_{up} \in R^{k\times d}\]
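A minimal PyTorch sketch of this bottleneck adapter (an illustrative module, not tied to any specific library):

import torch

class Adapter(torch.nn.Module):
    """Bottleneck adapter: down-project to k dims, ReLU, up-project back to d dims, plus residual."""
    def __init__(self, d, k):
        super().__init__()
        self.down = torch.nn.Linear(d, k, bias=False)   # W_down in R^{d x k}
        self.up = torch.nn.Linear(k, d, bias=False)     # W_up   in R^{k x d}

    def forward(self, x):
        # X' = ReLU(X . W_down) . W_up + X
        return self.up(torch.relu(self.down(x))) + x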
Performance comparison
model.add_adapter(lora_config, adapter_name="adapter_1")
model.add_adapter(lora_config, adapter_name="adapter_2")
model.set_adapter("adapter_1")
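Those calls assume that `model` supports adapters and that `lora_config` has already been created; a self-contained sketch of the setup with the transformers PEFT integration (the model choice and LoRA hyper-parameters are illustrative):

from transformers import AutoModelForCausalLM
from peft import LoraConfig

model = AutoModelForCausalLM.from_pretrained("gpt2")          # small model, for illustration only
lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05,
                         target_modules=["c_attn"])           # GPT-2 attention projection layers

model.add_adapter(lora_config, adapter_name="adapter_1")      # attach two independent LoRA adapters
model.add_adapter(lora_config, adapter_name="adapter_2")
model.set_adapter("adapter_1")                                # activate the first one for training/inference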
Can be applied to sparse FT:
- FT an LLM on a specific task/language
- extract the mask = the params that change most
- rewind the LLM and re-FT with the mask
- sparse finetunes can be combined without overlapping!
Pretraining and FT use same basic algorithm (SGD), but the differences in data size lead to differences in training regimes.
Why such a difference in regimes?
Why doesn’t pretraining forget?
- Vanilla prompting
- Chain-of-thought (CoT)
- Self-consistency
- Ensemble refinement
- Automatic chain-of-thought (Auto-CoT)
- Complex CoT
- Program-of-thoughts (PoT)
- Least-to-Most
- Chain-of-Symbols (CoS)
- Structured Chain-of-Thought (SCoT)
- Plan-and-solve (PS)
- MathPrompter
- Contrastive CoT/Contrastive self-consistency
- Federated Same/Different Parameter self-consistency/CoT
- Analogical reasoning
- Synthetic prompting
- Tree-of-thoughts (ToT)
- Logical Thoughts (LoT)
- Maieutic Prompting
- Verify-and-edit
- Reason + Act (ReACT)
- Active-Prompt
- Thread-of-thought (ThOT)
- Implicit RAG
- System 2 Attention (S2A)
- Instructed prompting
- Chain-of-Verification (CoVe)
- Chain-of-Knowledge (CoK)
- Chain-of-Code (CoC)
- Program-Aided Language Models (PAL)
- Binder
- Dater
- Chain-of-Table
- Decomposed Prompting (DeComp)
- Three-Hop reasoning (THOR)
- Metacognitive Prompting (MP)
- Chain-of-Event (CoE)
- Basic with Term definitions
- Basic + annotation guideline + error-analysis
<OBJECTIVE_AND_PERSONA>
You are a [insert a persona, such as a "math teacher" or "automotive expert"]. Your task is to...
</OBJECTIVE_AND_PERSONA>
<INSTRUCTIONS>
To complete the task, you need to follow these steps:
1.
2.
...
</INSTRUCTIONS>
------------- Optional Components ------------
<CONSTRAINTS>
Dos and don'ts for the following aspects
1. Dos
2. Don'ts
</CONSTRAINTS>
<CONTEXT>
The provided context
</CONTEXT>
<OUTPUT_FORMAT>
The output format must be
1.
2.
...
</OUTPUT_FORMAT>
<FEW_SHOT_EXAMPLES>
Here we provide some examples:
1. Example #1
Input:
Thoughts:
Output:
...
</FEW_SHOT_EXAMPLES>
<RECAP>
Re-emphasize the key aspects of the prompt, especially the constraints, output format, etc.
</RECAP>
TASK:
Classify the OBJECTS.
CLASSES:
- Large
- Small
OBJECTS:
- Rhino
- Mouse
- Snail
- Elephant
What is the most likely interpretation of this sentence? Explain your reasoning. The sentence: “The chef seasoned the chicken and put it in the oven because it looked pale.”
Extract the main issues and sentiments from the customer feedback on our telecom services.
Focus on comments related to service disruptions, billing issues, and customer support interactions.
Please format the output into a list with each issue/sentiment in a sentence, separated by semicolon.
Input: CUSTOMER_FEEDBACK
Classify the extracted issues into categories such as service reliability, pricing concerns, customer support quality, and others.
Please organize the output into JSON format with each issue as the key, and category as the value.
Input: TASK_1_RESPONSE
Generate detailed recommendations for each category of issues identified from the feedback.
Suggest specific actions to address service reliability, improving customer support, and adjusting pricing models, if necessary.
Please organize the output into a JSON format with each category as the key, and recommendation as the value.
Input: TASK_2_RESPONSE
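A minimal sketch of chaining these three prompts with ollama (the model name is arbitrary, CUSTOMER_FEEDBACK is a placeholder, and each prompt above is abbreviated with "..."):

import ollama

def ask(prompt):
    # send one prompt to a local model and return its text answer
    r = ollama.chat(model="qwen2.5", messages=[{"role": "user", "content": prompt}])
    return r["message"]["content"]

CUSTOMER_FEEDBACK = "..."  # placeholder: the raw customer feedback to analyze

task1 = ask("Extract the main issues and sentiments from the customer feedback ... Input: " + CUSTOMER_FEEDBACK)
task2 = ask("Classify the extracted issues into categories ... Input: " + task1)
task3 = ask("Generate detailed recommendations for each category ... Input: " + task2)
print(task3)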
CoT requires large models:
Question What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Thought 1 I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.
Action 1 Search[Colorado orogeny]
Observation 1 The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
Thought 2 It does not mention the eastern sector. So I need to look up eastern sector.
Action 2 Lookup[eastern sector]
Observation 2 (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.
Thought 3 The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.
Aside from the Apple Remote, what other devices can control the program Apple Remote was originally designed to interact with?
import dspy
lm = dspy.LM(model="ollama/qwen2.5", api_base="http://localhost:11434")
dspy.configure(lm=lm)
qa = dspy.Predict('question: str -> response: str')
response = qa(question="what are high memory and low memory on linux?")
print(response.response)
dspy.inspect_history(n=1)
cot = dspy.ChainOfThought('question -> response')
res = cot(question="should curly braces appear on their own line?")
print(res.response)
dspy.inspect_history(n=1)
from dspy.datasets import MATH
dataset = MATH(subset='algebra')
dev = dataset.dev[0:10]
example = dataset.train[0]
print("Question:", example.question)
print("Answer:", example.answer)
module = dspy.ChainOfThought("question -> answer")
print(module(question=example.question))
evaluate = dspy.Evaluate(devset=dev, metric=dataset.metric)
evaluate(module)
Implementing a RAG with DSPy; required imports:
import dspy
from dspy.teleprompt import BootstrapFewShot
from dspy.datasets import HotPotQA
from dspy.evaluate import Evaluate
from sentence_transformers import SentenceTransformer
import pandas as pd
passages0 = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-bioasq/data/passages.parquet/part.0.parquet")
test = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-bioasq/data/test.parquet/part.0.parquet")
passages = passages0[0:20]
class RetrievalModel(dspy.Retrieve):
    def __init__(self, passages):
        self.passages = passages
        # keep only passages longer than 20 words
        self.passages["valid"] = self.passages.passage.apply(lambda x: len(x.split(' ')) > 20)
        self.passages = self.passages[self.passages.valid]
        self.passages = self.passages.reset_index()
        for i, x in enumerate(self.passages.passage.tolist()):
            print("DOC", i, x)
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.passage_embeddings = self.model.encode(self.passages.passage.tolist())

    def __call__(self, query, k):
        query_embedding = self.model.encode(query)
        similarities = self.model.similarity(query_embedding, self.passage_embeddings).numpy()  # cosine similarities
        top_indices = similarities[0, :].argsort()[::-1][:k]  # pick the top-k documents with highest cosine similarity
        response = self.passages.loc[list(top_indices)]
        response = response.passage.tolist()
        return [dspy.Prediction(long_text=psg) for psg in response]
rm = RetrievalModel(passages)
qq = "Which cell may suffer from anemia?"
print(rm(qq,2))
llm = dspy.LM(model="ollama/qwen2.5:0.5b", api_base="http://localhost:11434")
print("llm ok")
dspy.settings.configure(lm=llm,rm=rm)
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

class RAG(dspy.Module):
    def __init__(self, num_passages=2):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)
rag = RAG()
pred = rag(qq)
print(f"Question: {qq}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")
llm.inspect_history(n=1)
dataset = []
for index, row in test.iterrows():
    dataset.append(dspy.Example(question=row.question, answer=row.answer).with_inputs("context", "question"))
trainset, devset = dataset[:4], dataset[17:20]

def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_PM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_PM

evaluate_on_devset = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=10)
evalres = evaluate_on_devset(rag, metric=validate_context_and_answer)
print(f"Evaluation Result: {evalres}")
teleprompter = BootstrapFewShot(metric=validate_context_and_answer, max_bootstrapped_demos=2, max_labeled_demos=2)
compiled_rag = teleprompter.compile(rag, trainset=trainset)
evalres = evaluate_on_devset(compiled_rag, metric=validate_context_and_answer)
print(f"Evaluation Result: {evalres}")
llm.inspect_history(n=1)
import dspy
from dsp.utils import deduplicate
from dspy.teleprompt import BootstrapFewShot
from dspy.retrieve.qdrant_rm import QdrantRM
from qdrant_client import QdrantClient
formatted_list = ["Phone Name: HTC Desire 610 8GB Unlocked GSM 4G LTE Quad-Core Android 4.4 Smartphone - Black (No Warranty)\nReview: The phone is very good , takes very sharp pictures but the screen is not bright'",
"Phone Name: Apple iPhone 6, Space Gray, 128 GB (Sprint)\nReview: I am very satisfied with the purchase, i got my iPhone 6 on time and even received a screen protectant with a charger. Thank you so much for the iPhone 6, it was worth the wait.",
]
client = QdrantClient(":memory:")
def add_documents(client, collection_name, formatted_list, batch_size=10):
    for i in range(0, len(formatted_list), batch_size):
        batch = formatted_list[i:i + batch_size]
        batch_ids = list(range(i + 1, i + 1 + len(batch)))
        client.add(
            collection_name=collection_name,
            documents=batch,
            ids=batch_ids
        )
        print(f"Batch {i // batch_size + 1} added with {len(batch)} documents.")
add_documents(client, "phone_collection", formatted_list)
qdrant_retriever_model = QdrantRM("phone_collection", client)
dspy.settings.configure(lm= llm, rm=qdrant_retriever_model)
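The Multihoprag module below references a GenerateSearchQuery signature that is not defined above; a minimal definition in the style of GenerateAnswer (and of the standard DSPy multi-hop example) could be:

class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    query = dspy.OutputField()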
class Multihoprag(dspy.Module):
    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()
        self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops

    def forward(self, question):
        context = []
        for hop in range(self.max_hops):
            query = self.generate_query[hop](context=context, question=question).query
            passages = self.retrieve(query).passages
            context = deduplicate(context + passages)
        pred = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=pred.answer)
trainset_list = [
{
"Question": "Which phones have the best camera quality and battery life based on recent reviews and specifications?",
"Answer": "Here's a list of phones that meet your criteria:\n\n1. Samsung Galaxy S21 Ultra\n2. Google Pixel 6 Pro\n3. Apple iPhone 13 Pro Max\n4. OnePlus 9 Pro\n5. Xiaomi Mi 11 Ultra\n\nNotes: These phones were picked based on their high ratings for camera quality and long-lasting battery life, as reported by recent reviews and detailed specifications."
},
{
"Question": "What are the top-rated phones with the best display and performance in the market right now?",
"Answer": "Here's a list of phones that meet your criteria:\n\n1. Samsung Galaxy S22\n2. Apple iPhone 14 Pro\n3. OnePlus 10 Pro\n\nNotes: These phones were selected because they have received excellent reviews for their display clarity and performance speed, making them ideal for users seeking high-quality visuals and efficient processing."
},
{
"Question": "Can you recommend phones that have the best user interface and build quality according to recent user reviews?",
"Answer": "Here's a list of phones that meet your criteria:\n\n1. Nokia 8.3 5G\n2. Sony Xperia 1 III\n\nNotes: These phones were chosen due to their outstanding user interface design and robust build quality, which have been highly praised in recent user reviews and expert evaluations."
}
]
trainset = [dspy.Example(question=item["Question"], answer=item["Answer"]).with_inputs('question') for item in trainset_list]
# metric function that prefers short and non-repetitive answers
def validate_answer_and_hops(example, pred, trace=None):
    # if not validate(pred.answer == example.answer): return False
    hops = [example.question] + [outputs.query for *_, outputs in trace if 'query' in outputs]
    if max([len(h) for h in hops]) > 100: return False
    if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))): return False
    return True
teleprompter = BootstrapFewShot(metric=validate_answer_and_hops)
uncompiled_rag = Multihoprag()
compiled_rag = teleprompter.compile(student=uncompiled_rag, trainset= trainset)
print(uncompiled_rag("Which smartphones are highly rated for their low-light camera performance and also have a great front camera?"))
print(compiled_rag("Which smartphones are highly rated for their low-light camera performance and also have a great front camera?"))
There exist several libraries to manipulate LLMs. The one you should choose mainly depends on two factors: your target task and your available hardware. You can find below a small selection of libraries grouped by task. Of course, several other powerful LLM libraries exist, but these are the most common ones as of August 2024. Thereafter, we will focus on the ollama library.
ollama is designed to make it easy to try out open-source LLMs locally. With just one or a few commands, it downloads a quantized LLM locally and launches an OpenAI-compatible server, which you may interact with using one of the many available ChatGPT-compatible clients. (Personal note: my preferred client is quite geeky, pure Linux command line: charm.sh mods). Ollama also provides simple command-line scripts to immediately start chatting with the LLM, without any server. As of August 2024, it is one of the preferred ways to quickly start using an LLM.
ollama run llama3.1
ollama run gemma2:2b
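Once the server is running, you can also query it from any OpenAI-compatible client; here is a minimal sketch with the openai Python package, assuming ollama's default local endpoint:

from openai import OpenAI

# ollama serves an OpenAI-compatible API on localhost:11434/v1; the API key just has to be non-empty
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
resp = client.chat.completions.create(
    model="llama3.1",
    messages=[{"role": "user", "content": "In one sentence, what is a RAG system?"}],
)
print(resp.choices[0].message.content)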
Any LLM is limited to the knowledge it has been trained on (and thus to its training-data cutoff date), and can only interact through text. A major trend in mid-2024 is to let LLMs interact with external tools, such as a calculator, a web search engine, a Python script execution sandbox… The underlying principle is to finetune the LLM to generate a special structured text format, in which the LLM writes the ID of some external tool and its arguments. The program that is calling the LLM can then interpret this structured text format and execute the call to the actual external tool. We can then continue our conversation with the LLM by feeding it the answer from the tool.
One important missing part: before doing all this, you must give ollama the list of available external tools. This is done by installing the ollama pip library, which enables you to call ollama from Python and to define one Python method for each tool.
Important: when listing the tools/Python methods for ollama, it is important to clearly describe in plain English what each method does, as well as each of its arguments, as the LLM decides whether to call a given tool based on its description!
Let's now put it into practice:
import ollama
import requests
import json
messages = [{'role': 'user', 'content': 'What is the main news right now in the USA?'}]
def getnews(c):
    # map the country name to the 2-letter code used by the news API
    c = c.lower().strip()
    if c == 'france': s = 'fr'
    elif c == 'india': s = 'in'
    elif c == 'usa': s = 'us'
    elif c == 'australia': s = 'au'
    elif c == 'russia': s = 'ru'
    elif c == 'united kingdom': s = 'gb'
    else:
        print("unknown country", c)
        s = 'fr'
    url = "https://saurav.tech/NewsAPI/top-headlines/category/general/" + s + ".json"
    print("calling fct")
    response = requests.get(url)
    res = response.text
    print("tool res", res)
    print("\n"*5)
    n = json.loads(res)
    # keep only the title and content of the first article
    r = n['articles'][0]['title'] + ": " + n['articles'][0]['content']
    print("extracting news", r, "\n"*3)
    return r
def main():
    response = ollama.chat(
        model='llama3.1',
        messages=messages,
        tools=[
            {
                'type': 'function',
                'function': {
                    'name': 'getnews',
                    'description': 'Get recent news from a country',
                    'parameters': {
                        'type': 'object',
                        'properties': {
                            'country': {
                                'type': 'string',
                                'description': 'The name of the country',
                            },
                        },
                        'required': ['country'],
                    },
                },
            },
        ],
    )
    # Add the model's response to the conversation history
    messages.append(response['message'])
    print("first answer", response['message'])
    # Check if the model decided to use the provided function
    if not response['message'].get('tool_calls'):
        print("The model didn't use the function. Its response was:")
        print(response['message']['content'])
        return
    # Process function calls made by the model
    if response['message'].get('tool_calls'):
        available_functions = {
            'getnews': getnews,
        }
        for tool in response['message']['tool_calls']:
            function_to_call = available_functions[tool['function']['name']]
            function_response = function_to_call(tool['function']['arguments']['country'])
            # Add function response to the conversation
            messages.append(
                {
                    'role': 'tool',
                    'content': function_response,
                }
            )
    # Second API call: Get final response from the model
    final_response = ollama.chat(model='llama3.1', messages=messages)
    print(final_response['message']['content'])
main()
# adapted from https://github.com/ollama/ollama-python/blob/main/examples/tools/main.py
debug model: https://blog.slavv.com/37-reasons-why-your-neural-network-is-not-working-4020854bd607