                                       ┌─────────────────────────┐
                                       │                         │
                                       │    Knowledge Graph      │
                                       │    on NebulaGraph       │
                                       │                         │
                                       │                 .───.   │
                                       │            ┌──▶(     )  │
                                       │            │    `───'   │
┌────────────────────┐                 │    .───.   │            │
│    Data Sources    │                 │   (     )──┘            │
│                    │     Extract     │    `───'                │
│ Database, Wikipedia│━━━With LLM━━━━▶ │      │     .───.        │
│ CSV, JSON Files    │                 │      └───▶(     )       │
│ Web APIs...        │                 │            `───'        │
└────────────────────┘                 │              ▲          │
                                       │              │          │
                                       │              │  .───.   │
                                       │              └─(     )  │
                                       │                 `───'   │
                                       │                         │
                                       │                         │
                                       └─────────────────────────┘
# Only For OpenAI
import os
os.environ['OPENAI_API_KEY'] = "INSERT OPENAI KEY"
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO) # logging.DEBUG for more verbose output
from llama_index import (
KnowledgeGraphIndex,
LLMPredictor,
ServiceContext,
SimpleDirectoryReader,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
from langchain import OpenAI
from IPython.display import Markdown, display
# define LLM; keep a reference to the raw LLM object, since it is reused by the query engines below
llm = OpenAI(temperature=0, model_name="text-davinci-002")
llm_predictor = LLMPredictor(llm=llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=512)
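Optionally, we can smoke-test the key before going further. A minimal check, assuming the legacy LangChain completion interface where the LLM object is callable (the prompt string is just an example):
# optional smoke test: a short completion proves the key and model are reachable
print(llm("Say hello in five words."))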
# Only For Azure OpenAI
import os
import json
import openai
from langchain.llms import AzureOpenAI
from langchain.embeddings import OpenAIEmbeddings
from llama_index import LangchainEmbedding
from llama_index import (
VectorStoreIndex,
SimpleDirectoryReader,
KnowledgeGraphIndex,
LLMPredictor,
ServiceContext
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
import logging
import sys
from IPython.display import Markdown, display
logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # logging.DEBUG for more verbose output
# note: basicConfig already installs a stdout handler; adding another StreamHandler would duplicate every log line
openai.api_type = "azure"
openai.api_base = "INSERT AZURE API BASE"
openai.api_version = "2022-12-01"
os.environ["OPENAI_API_KEY"] = "INSERT OPENAI KEY"
openai.api_key = os.getenv("OPENAI_API_KEY")
# define LLM
llm = AzureOpenAI(
deployment_name="INSERT DEPLOYMENT NAME",
temperature=0,
openai_api_version=openai.api_version,
model_kwargs={
"api_key": openai.api_key,
"api_base": openai.api_base,
"api_type": openai.api_type,
"api_version": openai.api_version,
}
)
llm_predictor = LLMPredictor(llm=llm)
# You need to deploy your own embedding model as well as your own chat completion model
embedding_llm = LangchainEmbedding(
OpenAIEmbeddings(
model="text-embedding-ada-002",
deployment="INSERT DEPLOYMENT NAME",
openai_api_key=openai.api_key,
openai_api_base=openai.api_base,
openai_api_type=openai.api_type,
openai_api_version=openai.api_version,
),
embed_batch_size=1,
)
service_context = ServiceContext.from_defaults(
llm_predictor=llm_predictor,
embed_model=embedding_llm,
)
❗ Access the NebulaGraph console to create the space and graph schema:
CREATE SPACE guardians(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);
:sleep 10;
USE guardians;
CREATE TAG entity(name string);
CREATE EDGE relationship(relationship string);
:sleep 10;
CREATE TAG INDEX entity_index ON entity(name(256));
os.environ['NEBULA_USER'] = "root"
os.environ['NEBULA_PASSWORD'] = "nebula" # default password
os.environ['NEBULA_ADDRESS'] = "127.0.0.1:9669" # assumed we have NebulaGraph installed locally
space_name = "guardians"
edge_types, rel_prop_names = ["relationship"], ["relationship"]  # defaults; can be omitted when creating from an empty KG
tags = ["entity"]  # default; can be omitted when creating from an empty KG
graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)
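Before building the index, it can be worth sanity-checking connectivity with the nebula3-python client directly. A minimal sketch, assuming NebulaGraph is reachable at 127.0.0.1:9669 with the default credentials:
from nebula3.gclient.net import ConnectionPool
from nebula3.Config import Config

# open a small connection pool against the local NebulaGraph and list spaces
pool = ConnectionPool()
assert pool.init([("127.0.0.1", 9669)], Config())
with pool.session_context("root", "nebula") as session:
    print(session.execute("SHOW SPACES;"))  # 'guardians' should be listed
pool.close()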
We will download and preprocess data from https://en.wikipedia.org/wiki/Guardians_of_the_Galaxy_Vol._3.
from llama_index import download_loader
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()
documents = loader.load_data(pages=['Guardians of the Galaxy Vol. 3'], auto_suggest=False)
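Optionally, we can peek at what the loader returned (assuming the reader yields llama_index Document objects with a text attribute):
# quick sanity check on the loaded page
print(f"{len(documents)} document(s) loaded")
print(documents[0].text[:200])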
We will persist the index both to disk and to NebulaGraph, so we don't need to run the extraction again when reusing it.
kg_index = KnowledgeGraphIndex.from_documents(
documents,
storage_context=storage_context,
max_triplets_per_chunk=10,
service_context=service_context,
space_name=space_name,
edge_types=edge_types,
rel_prop_names=rel_prop_names,
tags=tags,
include_embeddings=True,
)
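Before persisting, we can inspect a few of the extracted triplets. A minimal sketch, assuming the installed llama_index version exposes KnowledgeGraphIndex.get_networkx_graph (it did at the time of writing; requires networkx):
# dump the extracted KG into networkx for a quick look
g = kg_index.get_networkx_graph()
print(f"{g.number_of_nodes()} nodes, {g.number_of_edges()} edges")
for subj, obj, data in list(g.edges(data=True))[:5]:
    print(subj, "->", obj, data)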
Let's persist the storage context from memory to disk:
kg_index.storage_context.persist(persist_dir='./storage_graph')
The following files are generated:
!ls -l storage_graph
total 9120
-rw-r--r--@ 1 weyl staff   66922 Jul 12 20:26 docstore.json
-rw-r--r--@ 1 weyl staff 4594860 Jul 12 20:26 index_store.json
-rw-r--r--@ 1 weyl staff      51 Jul 12 20:26 vector_store.json
We will leverage the NebulaGraph Jupyter extension; remember to install it before the next step:
$ pip install ipython-ngql
%load_ext ngql
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula
%ngql USE guardians
We can query 30 arbitrary edges:
%ngql MATCH ()-[e]->() RETURN e LIMIT 30
INFO:nebula3.logger:Get connection to ('127.0.0.1', 9669)
   | e |
---|---|
0 | ("Adam Warlock")-[:relationship@98688268702526... |
1 | ("Alan F. Horn")-[:relationship@-3866030880391... |
2 | ("Alan F. Horn")-[:relationship@-3866030880391... |
3 | ("Bakalova")-[:relationship@-78310709996010382... |
4 | ("Bakalova")-[:relationship@-18287293525973127... |
5 | ("Bautista")-[:relationship@262829015229588616... |
6 | ("Bautista")-[:relationship@264209192087427643... |
7 | ("Chris Pratt")-[:relationship@-53886203992796... |
8 | ("Christopher Fairbank")-[:relationship@704429... |
9 | ("Cooper")-[:relationship@2642091920874276436{... |
10 | ("Daniela Melchior")-[:relationship@5794733688... |
11 | ("Dave Bautista")-[:relationship@-538862039927... |
12 | ("Debicki")-[:relationship@2682825685616935037... |
13 | ("Diesel")-[:relationship@2642091920874276436{... |
14 | ("Disney")-[:relationship@-7269035608107002438... |
15 | ("Disney")-[:relationship@4594936970614874383{... |
16 | ("Drax")-[:relationship@1274897091364343563{re... |
17 | ("Elizabeth Debicki")-[:relationship@704429536... |
18 | ("Gamora")-[:relationship@2108090488737331578{... |
19 | ("Gamora")-[:relationship@4452575226635738814{... |
20 | ("Gamora")-[:relationship@7254563908946132317{... |
21 | ("George MacKay")-[:relationship@2027380399406... |
22 | ("Gillan")-[:relationship@-1827525784919523442... |
23 | ("Gillan")-[:relationship@1278621438198917644{... |
24 | ("Gillan")-[:relationship@2642091920874276436{... |
25 | ("Gillan")-[:relationship@7823655194542812825{... |
26 | ("Gregg Henry")-[:relationship@704429536949728... |
27 | ("Guardians cast")-[:relationship@-64051353433... |
28 | ("Guardians of the Galaxy")-[:relationship@790... |
29 | ("Guardians of the Galaxy Vol. 3")-[:relations... |
And draw it:
%ng_draw
nebulagraph_draw.html
Now we have a knowledge graph built on top of Wikipedia. With NebulaGraph LLM tooling, we can query the KG in natural language (NL2Cypher).
First, let's use LlamaIndex:
from llama_index.query_engine import KnowledgeGraphQueryEngine
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
nl2kg_query_engine = KnowledgeGraphQueryEngine(
storage_context=storage_context,
service_context=service_context,
llm=llm,
verbose=True,
)
As we can see, KnowledgeGraphQueryEngine generates the graph query, runs it against the store, and finally lets the LLM synthesize the answer, all in one go!
response = nl2kg_query_engine.query(
"Tell me about Peter Quill?",
)
display(Markdown(f"<b>{response}</b>"))
Graph Store Query: MATCH (p:`entity`)-[:relationship]->(e:`entity`) WHERE p.`entity`.`name` == 'Peter Quill' RETURN e.`entity`.`name`;
Graph Store Response: {'e.entity.name': ['Guardians of the Galaxy']}
Final Response: Peter Quill is a character from the Marvel Comics series Guardians of the Galaxy.
💡 Apart from the end-to-end KGQA, we can also ask for just the NL2Cypher step with generate_query:
graph_query = nl2kg_query_engine.generate_query(
"Tell me about Peter Quill?",
)
graph_query = graph_query.replace("WHERE", "\n WHERE").replace("RETURN", "\nRETURN")
display(
Markdown(
f"""
```cypher
{graph_query}
```
"""
)
)
MATCH (p:`entity`)-[:relationship]->(e:`entity`)
WHERE p.`entity`.`name` == 'Peter Quill'
RETURN e.`entity`.`name`;
Then, of course, we can run the generated query ourselves!
%%ngql
MATCH (p:`entity`)-[:relationship]->(e:`entity`)
WHERE p.`entity`.`name` == 'Peter Quill'
RETURN e.`entity`.`name`;
INFO:nebula3.logger:Get connection to ('127.0.0.1', 9669)
   | e.entity.name |
---|---|
0 | Guardians of the Galaxy |
Or we can change the RETURN clause to the whole path, so that we can draw it!
%%ngql
MATCH path_0=(p:`entity`)-[:relationship]->(e:`entity`)
WHERE p.`entity`.`name` == 'Peter Quill'
RETURN path_0;
INFO:nebula3.logger:Get connection to ('127.0.0.1', 9669)
   | path_0 |
---|---|
0 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
%ng_draw
nebulagraph_draw.html
!mv nebulagraph_draw.html nebulagraph_draw_nl2cypher.html
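Next, let's do the same NL2Cypher QA with LangChain's NebulaGraphQAChain: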
from langchain.chat_models import ChatOpenAI
from langchain.chains import NebulaGraphQAChain
from langchain.graphs import NebulaGraph
graph = NebulaGraph(
space=space_name,
username="root",
password="nebula",
address="127.0.0.1",
port=9669,
session_pool_size=30,
)
chain = NebulaGraphQAChain.from_llm(
llm, graph=graph, verbose=True
)
chain.run(
"Tell me about Peter Quill?",
)
> Entering new chain...
Generated nGQL:
MATCH (p:`entity`)-[e:relationship]->(m:`entity`) WHERE p.`entity`.`name` == 'Peter Quill' RETURN p.`entity`.`name`, e.relationship, m.`entity`.`name`;
Full Context:
{'p.entity.name': ['Peter Quill'], 'e.relationship': ['is leader of'], 'm.entity.name': ['Guardians of the Galaxy']}
> Finished chain.
' Peter Quill is the leader of the Guardians of the Galaxy.'
Apart from the NL2Cypher fashion of exploiting the KG for QA, we can also use it, especially for complex tasks, in the Retrieval-Augmented Generation (RAG) way.
from llama_index import load_index_from_storage
storage_context_graph = StorageContext.from_defaults(persist_dir='./storage_graph', graph_store=graph_store)
kg_index_new = load_index_from_storage(
storage_context=storage_context_graph,
service_context=service_context,
max_triplets_per_chunk=10,
space_name=space_name,
edge_types=edge_types,
rel_prop_names=rel_prop_names,
tags=tags,
include_embeddings=True,
)
INFO:llama_index.indices.loading:Loading all indices.
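We then build a query engine on the reloaded index. Here include_text=False retrieves only the triplets rather than the original text chunks, retriever_mode='keyword' matches keywords extracted from the question against graph entities, and response_mode="tree_summarize" synthesizes the final answer from the retrieved triplets.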
kg_rag_query_engine = kg_index_new.as_query_engine(
include_text=False,
retriever_mode='keyword',
response_mode="tree_summarize",
)
response = kg_rag_query_engine.query(
"Tell me about Peter Quill?"
)
display(Markdown(f"<b>{response}</b>"))
INFO:llama_index.indices.knowledge_graph.retriever:> Starting query: Tell me about Peter Quill?
INFO:llama_index.indices.knowledge_graph.retriever:> Query keywords: ['biography', 'Peter Quill', 'Peter', 'Quill', 'information']
INFO:llama_index.indices.knowledge_graph.retriever:> Extracted relationships: The following are knowledge triplets in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`
Peter Quill ['is leader of', 'Guardians of the Galaxy', 'released in', '2014']
Peter Quill ['portrays', 'Peter Quill']
Peter Quill ['is leader of', 'Guardians of the Galaxy', 'reprised role from', 'Guardians of the Galaxy']
Peter Quill ['is leader of', 'Guardians of the Galaxy']
Peter Quill ['is leader of', 'Guardians of the Galaxy', 'directed', 'Guardians of the Galaxy']
Peter Quill ['is leader of', 'Guardians of the Galaxy', 'wrote', 'Guardians of the Galaxy']
Peter Quill ['is leader of', 'Guardians of the Galaxy', 'sequel to', 'Guardians of the Galaxy']
Quill ['speaks', ' fuck ']
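Finally, let's fetch Peter Quill's neighborhood within two hops and draw it: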
%%ngql
MATCH path0=(p:`entity`)-[*1..2]-() WHERE p.`entity`.`name` == 'Peter Quill'
RETURN path0;
INFO:nebula3.logger:Get connection to ('127.0.0.1', 9669)
   | path0 |
---|---|
0 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
1 | ("Peter Quill" :entity{name: "Peter Quill"})<-... |
2 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
3 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
4 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
5 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
6 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
7 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
8 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
9 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
10 | ("Peter Quill" :entity{name: "Peter Quill"})-[... |
%ng_draw
nebulagraph_draw.html
!mv nebulagraph_draw.html nebulagraph_draw_rag.html