Process Documents with Visuals: A Guide to Retrieval-Augmented Generation Using GPT-4o Vision

Traditional Retrieval-Augmented Generation (RAG) models excel with textual data, but struggle with documents that heavily rely on images, graphics, and tables. This article shows you how to leverage the vision modality to extract and interpret visual content, ensuring your generated responses are both informative and accurate.

Overcome RAG Limitations: Unlock Visual Understanding

Implementing Retrieval-Augmented Generation with GPT-4o for document understanding allows you to create AI solutions that deliver richer, more accurate information, significantly enhancing user satisfaction and engagement. This means better search results, more complete answers, and a better overall user experience. Learn how to set up your RAG system to accurately interpret documents with complex visual elements.

Key Concepts: From Setup to Semantic Search

In this guide, you'll explore and implement the following essential concepts:

Vector Store Setup with Pinecone: Initialize and configure Pinecone for efficient vector embeddings storage.
PDF Parsing & Visual Information Extraction: Convert PDF pages into images and use GPT-4o to extract vital textual data from visual elements.
Embedding Generation: Create robust vector representations of your textual data, focusing on pages with visual cues.
Embedding Upload to Pinecone: Store your embeddings for optimal storage and retrieval in Pinecone.
Semantic Search: Pinpoint the most relevant pages based on user queries using semantic search techniques.
Visual Content Handling: Enhance contextual accuracy by passing images to GPT-4o’s vision modality.

Step-by-Step: Building Your Vision-Enabled RAG System

Let’s walk through setting up a vector store with Pinecone.

Step 1: Setting Up Your Pinecone Vector Store

First, you'll set up a vector store using Pinecone to efficiently store and manage your embeddings.

Prerequisites:

Sign up for Pinecone and obtain your API key.
Install the Pinecone SDK: pip install "pinecone[grpc]"
Install python-dotenv: pip install python-dotenv

Make sure to store and access your API key securely.

import os
import time
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv

# Load variables from a local .env file so the API key does not have to be
# hard-coded in this script.
load_dotenv()

# os.getenv returns None when PINECONE_API_KEY is not set.
api_key = os.getenv("PINECONE_API_KEY")

# Initialize Pinecone client with your API key
pc = Pinecone(api_key)

# Create a serverless index
index_name = "my-test-index"

# Only create the index if it does not exist yet, so the script can be re-run.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        # NOTE(review): the dimension must equal the length of the embedding
        # vectors stored later; 3072 presumably matches the embedding model
        # used in a later step — confirm.
        dimension = 3072,
        metric = "cosine",
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        )
    )

# Wait for the index to be ready
# Polls once per second; there is no upper bound on the wait.
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

Step 2: Parsing PDFs and Extracting Vital Visual Information

Next, you’ll parse your PDF document and extract textual and visual information such as image and table descriptions.

Prerequisites:

Install the necessary packages: pip install PyPDF2 pdf2image pytesseract pandas tqdm requests openai
Note that pdf2image also requires Poppler to be installed on your system.

import base64
import requests
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfWriter
from pdf2image import convert_from_bytes
from io import BytesIO
from openai import OpenAI
from tqdm import tqdm

# URL of the PDF to process; it is downloaded by chunk_document().
document_to_parse = "https://documents1.worldbank.org/curated/en/099101824180532047/pdf/BOSIB13bdde89d07f1b3711dd8e86adb477.pdf"
# Shared OpenAI client used by get_vision_response(). No API key is passed
# here, so the client has to obtain its credentials from the environment.
oai_client = OpenAI()

def chunk_document(document_url, timeout=60):
    """Download a PDF and split it into standalone single-page PDFs.

    Args:
        document_url: HTTP(S) URL of the PDF to download.
        timeout: Seconds to wait on the server before giving up
            (passed to ``requests.get``).

    Returns:
        A list of dicts, one per page in document order, each with
        ``'pageNumber'`` (1-based) and ``'pdfBytes'`` (that page serialized
        as its own one-page PDF).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Without a timeout a stalled server would block this call forever.
    response = requests.get(document_url, timeout=timeout)
    # Fail here on HTTP errors instead of handing an error page to PdfReader.
    response.raise_for_status()

    pdf_reader = PdfReader(BytesIO(response.content))
    page_chunks = []

    for page_number, page in enumerate(pdf_reader.pages, start=1):
        # Write each page into a fresh writer so it becomes a one-page PDF.
        pdf_writer = PdfWriter()
        pdf_writer.add_page(page)
        buffer = BytesIO()
        pdf_writer.write(buffer)
        page_chunks.append({
            'pageNumber': page_number,
            'pdfBytes': buffer.getvalue(),
        })

    return page_chunks

def encode_image(local_image_path):
    """Return the contents of the file at *local_image_path* as base64 text."""
    with open(local_image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def convert_page_to_image(pdf_bytes, page_number):
    """Render a one-page PDF to ``images/page_<page_number>.png``.

    Args:
        pdf_bytes: Bytes of a PDF document; only its first page is rendered.
        page_number: Number used to build the output file name.

    Returns:
        The path of the saved PNG file, as a string.
    """
    images = convert_from_bytes(pdf_bytes)
    # Only the first rendered page is kept.
    image = images[0]

    images_dir = 'images'
    os.makedirs(images_dir, exist_ok=True)

    # The original literal was written as `f "..."`, which is a syntax error,
    # and contained stray spaces that would have ended up in the file name.
    image_file_name = f"page_{page_number}.png"
    image_file_path = os.path.join(images_dir, image_file_name)
    image.save(image_file_path, 'PNG')

    return image_file_path

def get_vision_response(prompt, image_path):
    """Send a text prompt plus one image to GPT-4o and return the response.

    Args:
        prompt: Instruction text, sent as the text part of a user message.
        image_path: Path of the image file to attach; it is embedded in the
            request as a base64 data URL.

    Returns:
        The object returned by ``oai_client.chat.completions.create``.
    """
    import mimetypes

    base64_image = encode_image(image_path)
    # Declare the image's actual type in the data URL. The original always
    # said image/jpeg; that value is kept only as the fallback for paths
    # whose type cannot be guessed from the extension.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    response = oai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
    )
    return response

def process_document(document_url):
    """Run every page of a PDF through GPT-4o vision and tabulate the output.

    Each page is split out, rendered to an image, and sent to the model with
    a prompt asking for a transcription of the text plus descriptions of any
    chart or table.

    Args:
        document_url: URL of the PDF to download and process.

    Returns:
        A pandas DataFrame with one row per page and the columns
        ``PageNumber``, ``ImagePath`` and ``PageText``; or ``None`` if any
        step raised, in which case the error is printed instead.
    """
    # The instructions are identical for every page, so build them once.
    # Each piece ends with a space so the numbered steps do not run together
    # when Python joins the adjacent literals.
    prompt = (
        "The user will provide you an image of a document file. Perform the following actions: "
        "1. Transcribe the text on the page. **TRANSCRIPTION OF THE TEXT:** "
        "2. If there is a chart, describe the image and include the text **DESCRIPTION OF THE IMAGE OR CHART** "
        "3. If there is a table, transcribe the table and include the text **TRANSCRIPTION OF THE TABLE**"
    )

    try:
        print("Document processing started")
        page_chunks = chunk_document(document_url)
        page_data_list = []

        for page_chunk in tqdm(page_chunks, desc='Processing Pages'):
            page_number = page_chunk['pageNumber']
            image_path = convert_page_to_image(page_chunk['pdfBytes'], page_number)

            vision_response = get_vision_response(prompt, image_path)
            page_data_list.append({
                'PageNumber': page_number,
                'ImagePath': image_path,
                'PageText': vision_response.choices[0].message.content,
            })

        pdf_df = pd.DataFrame(page_data_list)
        print("Document processing completed.")
        print("DataFrame created with page data.")

        return pdf_df
    except Exception as err:
        # Best-effort by design: report the failure and let the caller see
        # None rather than an exception.
        print(f"Error processing document: {err}")
        return None

# Run the pipeline on the sample document. On success df has one row per page
# (PageNumber, ImagePath, PageText); process_document prints the error and
# yields None if any step fails.
df = process_document(document_to_parse)

This code snippet shows how to parse PDF documents and harness GPT-4o's vision modality to extract meaningful insights from textual and visual elements, improving the accuracy and depth of your RAG system.

The Benefits: Enhanced Accuracy and User Satisfaction

By integrating visual processing into your RAG system, you ensure more comprehensive and accurate responses. For scenarios where visual data is critical, this can boost user satisfaction and overall engagement. Utilizing GPT-4o's vision capability improves the accuracy of responses for visually rich data.

Process Documents with Visuals: A Guide to Retrieval-Augmented Generation Using GPT-4o Vision

Overcome RAG Limitations: Unlock Visual Understanding

Key Concepts: From Setup to Semantic Search

In this guide, you'll explore and implement the following essential concepts:

Vector Store Setup with Pinecone: Initialize and configure Pinecone for efficient vector embeddings storage.
PDF Parsing & Visual Information Extraction: Convert PDF pages into images and use GPT-4o to extract vital textual data from visual elements.
Embedding Generation: Create robust vector representations of your textual data, focusing on pages with visual cues.
Embedding Upload to Pinecone: Store your embeddings for optimal storage and retrieval in Pinecone.
Semantic Search: Pinpoint the most relevant pages based on user queries using semantic search techniques.
Visual Content Handling: Enhance contextual accuracy by passing images to GPT-4o’s vision modality.

Step-by-Step: Building Your Vision-Enabled RAG System

Let’s walk through setting up a vector store with Pinecone.

Step 1: Setting Up Your Pinecone Vector Store

First, you'll set up a vector store using Pinecone to efficiently store and manage your embeddings.

Prerequisites:

Sign up for Pinecone and obtain your API key.
Install the Pinecone SDK: pip install "pinecone[grpc]"
Install python-dotenv: pip install python-dotenv

Make sure to store and access your API key securely.

import os
import time
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv

# Load variables from a local .env file so the API key does not have to be
# hard-coded in this script.
load_dotenv()

# os.getenv returns None when PINECONE_API_KEY is not set.
api_key = os.getenv("PINECONE_API_KEY")

# Initialize Pinecone client with your API key
pc = Pinecone(api_key)

# Create a serverless index
index_name = "my-test-index"

# Only create the index if it does not exist yet, so the script can be re-run.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        # NOTE(review): the dimension must equal the length of the embedding
        # vectors stored later; 3072 presumably matches the embedding model
        # used in a later step — confirm.
        dimension = 3072,
        metric = "cosine",
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        )
    )

# Wait for the index to be ready
# Polls once per second; there is no upper bound on the wait.
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

Step 2: Parsing PDFs and Extracting Vital Visual Information

Next, you’ll parse your PDF document and extract textual and visual information such as image and table descriptions.

Prerequisites:

Install the necessary packages: pip install PyPDF2 pdf2image pytesseract pandas tqdm requests openai
Note that pdf2image also requires Poppler to be installed on your system.

import base64
import requests
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfWriter
from pdf2image import convert_from_bytes
from io import BytesIO
from openai import OpenAI
from tqdm import tqdm

# URL of the PDF to process; it is downloaded by chunk_document().
document_to_parse = "https://documents1.worldbank.org/curated/en/099101824180532047/pdf/BOSIB13bdde89d07f1b3711dd8e86adb477.pdf"
# Shared OpenAI client used by get_vision_response(). No API key is passed
# here, so the client has to obtain its credentials from the environment.
oai_client = OpenAI()

def chunk_document(document_url, timeout=60):
    """Download a PDF and split it into standalone single-page PDFs.

    Args:
        document_url: HTTP(S) URL of the PDF to download.
        timeout: Seconds to wait on the server before giving up
            (passed to ``requests.get``).

    Returns:
        A list of dicts, one per page in document order, each with
        ``'pageNumber'`` (1-based) and ``'pdfBytes'`` (that page serialized
        as its own one-page PDF).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Without a timeout a stalled server would block this call forever.
    response = requests.get(document_url, timeout=timeout)
    # Fail here on HTTP errors instead of handing an error page to PdfReader.
    response.raise_for_status()

    pdf_reader = PdfReader(BytesIO(response.content))
    page_chunks = []

    for page_number, page in enumerate(pdf_reader.pages, start=1):
        # Write each page into a fresh writer so it becomes a one-page PDF.
        pdf_writer = PdfWriter()
        pdf_writer.add_page(page)
        buffer = BytesIO()
        pdf_writer.write(buffer)
        page_chunks.append({
            'pageNumber': page_number,
            'pdfBytes': buffer.getvalue(),
        })

    return page_chunks

def encode_image(local_image_path):
    """Return the contents of the file at *local_image_path* as base64 text."""
    with open(local_image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def convert_page_to_image(pdf_bytes, page_number):
    """Render a one-page PDF to ``images/page_<page_number>.png``.

    Args:
        pdf_bytes: Bytes of a PDF document; only its first page is rendered.
        page_number: Number used to build the output file name.

    Returns:
        The path of the saved PNG file, as a string.
    """
    images = convert_from_bytes(pdf_bytes)
    # Only the first rendered page is kept.
    image = images[0]

    images_dir = 'images'
    os.makedirs(images_dir, exist_ok=True)

    # The original literal was written as `f "..."`, which is a syntax error,
    # and contained stray spaces that would have ended up in the file name.
    image_file_name = f"page_{page_number}.png"
    image_file_path = os.path.join(images_dir, image_file_name)
    image.save(image_file_path, 'PNG')

    return image_file_path

def get_vision_response(prompt, image_path):
    """Send a text prompt plus one image to GPT-4o and return the response.

    Args:
        prompt: Instruction text, sent as the text part of a user message.
        image_path: Path of the image file to attach; it is embedded in the
            request as a base64 data URL.

    Returns:
        The object returned by ``oai_client.chat.completions.create``.
    """
    import mimetypes

    base64_image = encode_image(image_path)
    # Declare the image's actual type in the data URL. The original always
    # said image/jpeg; that value is kept only as the fallback for paths
    # whose type cannot be guessed from the extension.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    response = oai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
    )
    return response

def process_document(document_url):
    """Run every page of a PDF through GPT-4o vision and tabulate the output.

    Each page is split out, rendered to an image, and sent to the model with
    a prompt asking for a transcription of the text plus descriptions of any
    chart or table.

    Args:
        document_url: URL of the PDF to download and process.

    Returns:
        A pandas DataFrame with one row per page and the columns
        ``PageNumber``, ``ImagePath`` and ``PageText``; or ``None`` if any
        step raised, in which case the error is printed instead.
    """
    # The instructions are identical for every page, so build them once.
    # Each piece ends with a space so the numbered steps do not run together
    # when Python joins the adjacent literals.
    prompt = (
        "The user will provide you an image of a document file. Perform the following actions: "
        "1. Transcribe the text on the page. **TRANSCRIPTION OF THE TEXT:** "
        "2. If there is a chart, describe the image and include the text **DESCRIPTION OF THE IMAGE OR CHART** "
        "3. If there is a table, transcribe the table and include the text **TRANSCRIPTION OF THE TABLE**"
    )

    try:
        print("Document processing started")
        page_chunks = chunk_document(document_url)
        page_data_list = []

        for page_chunk in tqdm(page_chunks, desc='Processing Pages'):
            page_number = page_chunk['pageNumber']
            image_path = convert_page_to_image(page_chunk['pdfBytes'], page_number)

            vision_response = get_vision_response(prompt, image_path)
            page_data_list.append({
                'PageNumber': page_number,
                'ImagePath': image_path,
                'PageText': vision_response.choices[0].message.content,
            })

        pdf_df = pd.DataFrame(page_data_list)
        print("Document processing completed.")
        print("DataFrame created with page data.")

        return pdf_df
    except Exception as err:
        # Best-effort by design: report the failure and let the caller see
        # None rather than an exception.
        print(f"Error processing document: {err}")
        return None

# Run the pipeline on the sample document. On success df has one row per page
# (PageNumber, ImagePath, PageText); process_document prints the error and
# yields None if any step fails.
df = process_document(document_to_parse)