Skip to main content
Add documents to knowledge bases using methods like add_pdf(), add_website(), and add_text(). Documents are automatically chunked and vectorized for retrieval.

Quick Start

from lyzr import Studio

studio = Studio(api_key="your-api-key")

kb = studio.create_knowledge_base(name="my_docs")

# Add various document types
kb.add_pdf("manual.pdf")
kb.add_docx("report.docx")
kb.add_txt("notes.txt")
kb.add_website("https://docs.example.com", max_pages=50)
kb.add_text("Custom FAQ content", source="faq")

add_pdf()

Add a PDF document to the knowledge base.
kb.add_pdf(
    file_path: str,
    chunk_size: int = 1024,
    chunk_overlap: int = 128,
    data_parser: str = None,
    extra_info: str = None
) -> bool

Parameters

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| file_path | str | Required | Path to PDF file |
| chunk_size | int | 1024 | Size of text chunks in characters |
| chunk_overlap | int | 128 | Overlap between chunks |
| data_parser | str | "llmsherpa" | PDF parser to use |
| extra_info | str | None | Extra metadata as JSON string |

Examples

# Basic usage
kb.add_pdf("document.pdf")

# With custom chunking
kb.add_pdf(
    "document.pdf",
    chunk_size=2048,
    chunk_overlap=256
)

# With metadata
kb.add_pdf(
    "manual.pdf",
    extra_info='{"version": "2.0", "department": "support"}'
)

add_docx()

Add a Word document to the knowledge base.
kb.add_docx(
    file_path: str,
    chunk_size: int = 1024,
    chunk_overlap: int = 128,
    data_parser: str = None,
    extra_info: str = None
) -> bool

Parameters

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| file_path | str | Required | Path to DOCX file |
| chunk_size | int | 1024 | Size of text chunks |
| chunk_overlap | int | 128 | Overlap between chunks |
| data_parser | str | "docx2txt" | Document parser |
| extra_info | str | None | Extra metadata |

Example

kb.add_docx("report.docx", chunk_size=1500)

add_txt()

Add a plain text file to the knowledge base.
kb.add_txt(
    file_path: str,
    chunk_size: int = 1024,
    chunk_overlap: int = 128,
    data_parser: str = None,
    extra_info: str = None
) -> bool

Parameters

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| file_path | str | Required | Path to TXT file |
| chunk_size | int | 1024 | Size of text chunks |
| chunk_overlap | int | 128 | Overlap between chunks |
| data_parser | str | "simple" | Text parser |
| extra_info | str | None | Extra metadata |

Example

kb.add_txt("notes.txt")

add_website()

Add website content to the knowledge base with optional crawling.
kb.add_website(
    url: str | List[str],
    source: str = "website",
    max_pages: int = 1,
    max_depth: int = 0,
    chunk_size: int = 1024,
    chunk_overlap: int = 128,
    dynamic_content_wait_secs: int = None,
    crawler_type: str = None
) -> bool

Parameters

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| url | str \| List[str] | Required | URL or list of URLs |
| source | str | "website" | Source identifier |
| max_pages | int | 1 | Maximum pages to crawl |
| max_depth | int | 0 | Maximum crawl depth (0 = single page) |
| chunk_size | int | 1024 | Size of text chunks |
| chunk_overlap | int | 128 | Overlap between chunks |
| dynamic_content_wait_secs | int | 5 | Wait time for dynamic content |
| crawler_type | str | "cheerio" | Crawler type |

Examples

# Single page
kb.add_website("https://docs.example.com")

# Crawl multiple pages
kb.add_website(
    "https://docs.example.com",
    max_pages=50,
    max_depth=3
)

# Multiple URLs
kb.add_website([
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
])

# Documentation site
kb.add_website(
    "https://docs.example.com/getting-started",
    max_pages=100,
    max_depth=5,
    source="documentation"
)

# Wait for dynamic content (SPAs)
kb.add_website(
    "https://app.example.com/docs",
    dynamic_content_wait_secs=10
)

add_text()

Add raw text content directly to the knowledge base.
kb.add_text(
    text: str,
    source: str,
    chunk_size: int = 1024,
    chunk_overlap: int = 128
) -> bool

Parameters

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| text | str | Required | Text content to add |
| source | str | Required | Source identifier |
| chunk_size | int | 1024 | Size of text chunks |
| chunk_overlap | int | 128 | Overlap between chunks |

Examples

# Add FAQ content
kb.add_text(
    "Q: What are your business hours?\nA: We're open 9am-5pm PST, Monday-Friday.",
    source="faq"
)

# Add multiple text entries
faqs = [
    ("What is your return policy?", "30-day money-back guarantee"),
    ("How do I contact support?", "Email support@example.com"),
    ("What payment methods do you accept?", "Visa, Mastercard, PayPal")
]

for question, answer in faqs:
    kb.add_text(
        f"Q: {question}\nA: {answer}",
        source="faq"
    )

Chunking Configuration

Documents are split into chunks for efficient retrieval. Configure chunking to optimize for your use case:

Small Chunks (Precise Retrieval)

kb.add_pdf(
    "document.pdf",
    chunk_size=512,
    chunk_overlap=64
)
Best for:
  • FAQ-style content
  • Technical documentation
  • When precision is important

Large Chunks (More Context)

kb.add_pdf(
    "document.pdf",
    chunk_size=2048,
    chunk_overlap=256
)
Best for:
  • Narrative content
  • Legal documents
  • When context is important

Bulk Document Loading

import os

# Add all PDFs from a directory
pdf_dir = "./documents"
for filename in os.listdir(pdf_dir):
    if filename.endswith(".pdf"):
        kb.add_pdf(os.path.join(pdf_dir, filename))
        print(f"Added: {filename}")

Examples

Documentation Website

kb = studio.create_knowledge_base(name="product_docs")

# Add main documentation
kb.add_website(
    "https://docs.example.com",
    max_pages=200,
    max_depth=5
)

# Add API reference
kb.add_website(
    "https://api.example.com/docs",
    max_pages=50,
    max_depth=2
)

# Add changelog
kb.add_pdf("changelog.pdf")

Support Knowledge Base

kb = studio.create_knowledge_base(name="support_kb")

# Add support articles
kb.add_website("https://support.example.com", max_pages=100)

# Add PDF manuals
kb.add_pdf("user_manual.pdf")
kb.add_pdf("troubleshooting_guide.pdf")

# Add FAQ text
kb.add_text(
    """
    Q: How do I reset my password?
    A: Click 'Forgot Password' on the login page and follow the instructions.

    Q: How do I contact support?
    A: Email support@example.com or call 1-800-EXAMPLE.
    """,
    source="faq"
)

Mixed Content

kb = studio.create_knowledge_base(name="company_kb")

# Internal documents
kb.add_pdf("employee_handbook.pdf")
kb.add_docx("policies.docx")

# External content
kb.add_website("https://blog.company.com", max_pages=50)

# Dynamic content
kb.add_text(
    f"Current quarter: Q1 2024\nRevenue target: $10M",
    source="metrics"
)

Error Handling

from lyzr.exceptions import APIError, ValidationError

try:
    kb.add_pdf("document.pdf")
except FileNotFoundError:
    print("File not found")
except APIError as e:
    print(f"Upload failed: {e.message}")

Processing Time

Document processing can take time, especially for:
  • Large PDFs (many pages)
  • Website crawling (many pages)
  • Complex documents
The ADK uses a 5-minute timeout for document operations. For very large documents, consider splitting them into smaller files.