Technology · February 28, 2024

GenAI Data Ingestion Just Got Easier with Unstructured.io and Astra DB

Eric Hare
Eric Hare, Software engineer
GenAI Data Ingestion Just Got Easier with Unstructured.io and Astra DB
# Install the unstructured library with its "all-docs" and "astra" extras.
pip install "unstructured[all-docs]"
pip install "unstructured[astra]"

# Provides the llama_index.embeddings.huggingface module imported by the
# query snippet further down.
# NOTE(review): that snippet also imports llama_index.vector_stores.astra,
# which presumably needs its own package installed as well — confirm.
pip install llama-index-embeddings-huggingface
from unstructured.partition.html import partition_html

# Partition the Astra DB pricing page into elements and print the string
# form of each one, separated by a blank line.
url = "https://www.datastax.com/pricing/astra-db"
elements = partition_html(url=url)
print(*elements, sep="\n\n")
import os

from dotenv import load_dotenv

from unstructured.ingest.runner.writers.base_writer import Writer
from unstructured.ingest.runner.writers.astra import AstraWriter

from unstructured.partition.html import partition_html

# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups used later in this walkthrough can find them.
load_dotenv()

# Partition the Astra DB pricing page into individual elements.
url = "https://www.datastax.com/pricing/astra-db"
elements = partition_html(url=url)

# exist_ok=True makes directory creation a single idempotent call instead of
# a separate exists-check followed by a create.
os.makedirs("local-input-to-astra", exist_ok=True)

for elem in elements:
    # Write each element's text to its own .txt file, named by element id.
    # The encoding is pinned to UTF-8 because scraped web text can contain
    # characters that the platform's default encoding cannot represent.
    with open(f"local-input-to-astra/{elem.id}.txt", "w", encoding="utf-8") as f:
        f.write(elem.text)

from unstructured.ingest.connector.local import SimpleLocalConfig
from unstructured.ingest.connector.astra import (
   AstraAccessConfig,
   AstraWriteConfig,
   SimpleAstraConfig,
)
from unstructured.ingest.interfaces import (
   ChunkingConfig,
   EmbeddingConfig,
   PartitionConfig,
   ProcessorConfig,
   ReadConfig,
)
from unstructured.ingest.runner import LocalRunner
from unstructured.ingest.runner.writers.base_writer import Writer
from unstructured.ingest.runner.writers.astra import (
   AstraWriter,
)

def get_writer() -> Writer:
    """Build the Astra DB writer from environment variables.

    Environment variables read:
        ASTRA_DB_API_ENDPOINT: passed as the access config's ``api_endpoint``.
        ASTRA_DB_APPLICATION_TOKEN: passed as the access config's ``token``.
        ASTRA_DB_COLLECTION_NAME: target collection (default "unstructured").
        ASTRA_DB_EMBEDDING_DIMENSION: embedding dimension (default 384).

    Returns:
        An ``AstraWriter`` configured with a write batch size of 20.

    Raises:
        ValueError: if ASTRA_DB_EMBEDDING_DIMENSION is set to a value that
            is not a valid integer.
    """
    # os.getenv returns a string whenever the variable is set, so convert
    # explicitly; otherwise the dimension is an int only on the default path.
    embedding_dimension = int(os.getenv("ASTRA_DB_EMBEDDING_DIMENSION", 384))

    return AstraWriter(
        connector_config=SimpleAstraConfig(
            access_config=AstraAccessConfig(
                api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
                token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
            ),
            collection_name=os.getenv("ASTRA_DB_COLLECTION_NAME", "unstructured"),
            embedding_dimension=embedding_dimension,
        ),
        write_config=AstraWriteConfig(batch_size=20),
    )

# Destination: the Astra DB writer built from environment settings.
writer = get_writer()

# Configure each stage separately, then hand everything to the runner.
processor_config = ProcessorConfig(
    verbose=True,
    output_dir="local-output-to-astra",
    num_processes=2,
)
# Source: the local directory of text files.
source_config = SimpleLocalConfig(input_path="local-input-to-astra")
read_config = ReadConfig()
partition_config = PartitionConfig()
chunking_config = ChunkingConfig(chunk_elements=True)
# NOTE(review): no model name is given here, so the provider's default model
# is presumably used, while the query snippet later in this walkthrough embeds
# with "BAAI/bge-small-en-v1.5" — confirm both sides use the same model.
embedding_config = EmbeddingConfig(provider="langchain-huggingface")

runner = LocalRunner(
    processor_config=processor_config,
    connector_config=source_config,
    read_config=read_config,
    partition_config=partition_config,
    chunking_config=chunking_config,
    embedding_config=embedding_config,
    writer=writer,
    writer_kwargs={},
)
runner.run()
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.astra import AstraDBVectorStore

# Build the vector store from the same environment variables (and the same
# defaults) that get_writer() uses for ingestion.
astra_db_store = AstraDBVectorStore(
    token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
    api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
    collection_name=os.getenv("ASTRA_DB_COLLECTION_NAME", "unstructured"),
    # os.getenv returns a string whenever the variable is set; convert so the
    # dimension is always an int, not only on the default path.
    embedding_dimension=int(os.getenv("ASTRA_DB_EMBEDDING_DIMENSION", 384)),
)

# Build an index over the existing vector store, using a HuggingFace model
# to embed incoming queries.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
index = VectorStoreIndex.from_vector_store(
    vector_store=astra_db_store,
    embed_model=embed_model,
)

# Ask a question through the index's query engine and print the answer text.
query_engine = index.as_query_engine()
question = "how much is the astra db free tier?"
response = query_engine.query(question)

print(response.response)
The Astra DB free tier provides $25 monthly credit for the first three months, allowing users to explore the service without incurring costs during this initial period.
Discover more
Vector Search · Retrieval-augmented generation
Share

One-stop Data API for Production GenAI

Astra DB gives JavaScript developers a complete data API and out-of-the-box integrations that make it easier to build production RAG apps with high relevancy and low latency.