Skip to content

Converting MS Office Files to PDFs for RAG

Published: at 07:45 PM

For those building Retrieval Augmented Generation (RAG) applications, you’ll probably start with PDFs but very quickly need to include other file types, particularly Microsoft Office files like docx, pptx, and xlsx. Ideally you’d build separate parsers and pipelines for each file type to take advantage of the rich data and structure encoded within each file type, but converting these to PDFs can be a quick way to incorporate them immediately into an application that already handles PDFs. Once converted, you can do things like display them in your application’s PDF viewer and add them to your vector database without needing to write a bunch of additional code.

If the data already sits in MS Sharepoint or OneDrive, one of the easiest ways to handle conversion that doesn’t require any additional infrastructure is to use the MS Graph API.

  1. Get an Access Token

Register an App in Azure App Registration Portal and get the tenant_id, client_id, and client_secret. Make sure you give yourself the necessary api permissions, like the following:

Once you’ve stored the above ids and secrets in a secure location, you can use the following function to get an access token:

import httpx

def get_access_token(tenant_id: str, client_id: str, client_secret: str) -> str:
    authority = (
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
    )
    payload = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
        "resource": "https://graph.microsoft.com/",
    }
    response = httpx.post(
        url=authority,
        data=payload,
    )
    response.raise_for_status()
    return response.json()["access_token"]

Note that the token expires, so you’ll need to refresh it periodically. You can get the "expires_on" timestamp from the token payload.

  1. Get the Drive ID Endpoint and Drive ID

Note that file_url here is the actual https url of the file in your browser.

from urllib.parse import urlparse

def get_site_id_with_host_name(sharepoint_site_name: str, access_token: str) -> str:
    site_information_endpoint = f"https://graph.microsoft.com/v1.0/sites?search={sharepoint_site_name}"

    response = httpx.get(
        url=site_information_endpoint,
        headers={
            "Authorization": f"Bearer {access_token}",
        },
    )
    response.raise_for_status()
    if (
        len(response.json()["value"]) > 0
        and "id" in response.json()["value"][0]
    ):
        site_id_with_host_name = response.json()["value"][0]["id"]
        return site_id_with_host_name
    else:
        raise ValueError(
            f"The specified sharepoint site {sharepoint_site_name} is not found."
        )

def get_drive_id(drive_id_endpoint: str, access_token: str) -> str:
    response = httpx.get(
        url=drive_id_endpoint,
        headers={
            "Authorization": f"Bearer {access_token}",
        },
    )

    response.raise_for_status()
    if (
        len(response.json()["value"]) > 0
        and "id" in response.json()["value"][0]
    ):
        drive_id = response.json()["value"][0]["id"]
        return drive_id
    else:
        raise ValueError(
            "Error occurred while fetching the drives for the sharepoint site."
        )

def get_sharepoint_site_name(file_url: str) -> str:
    parsed_url = urlparse(file_url)
    path_parts = parsed_url.path.split('/')
    sites_index = path_parts.index('sites')
    sharepoint_name = path_parts[sites_index + 1]
    return f"{sharepoint_name}/"

def get_drive_id_endpoint_and_drive_id(
    file_url: str,
    access_token: str,
) -> tuple[str, str]:
    sharepoint_site_name = get_sharepoint_site_name(file_url)
    site_id_with_host_name = get_site_id_with_host_name(
        sharepoint_site_name,
        access_token,
    )
    drive_id_endpoint = f"https://graph.microsoft.com/v1.0/sites/{site_id_with_host_name}/drives"
    drive_id = get_drive_id(drive_id_endpoint, access_token)
    return drive_id_endpoint, drive_id
  1. Get the Item ID

Note that file_url here is the actual https url of the file in your browser.

import re
from urllib.parse import parse_qs

def get_sharepoint_guid(file_url: str) -> str:
    parsed_url = urlparse(file_url)
    query_params = parse_qs(parsed_url.query)

    # Get the sourcedoc GUID and remove curly braces
    sourcedoc = query_params.get('sourcedoc', [''])[0]
    guid = re.sub(r'[{}]', '', sourcedoc)
    return guid


def get_item_id(
    drive_id_endpoint: str,
    drive_id: str,
    file_url: str,
    access_token: str,
) -> str:
    guid = get_sharepoint_guid(file_url)

    response = httpx.get(
        url=f"{drive_id_endpoint}/{drive_id}/items/{guid}",
        headers={
            "Authorization": f"Bearer {access_token}",
        },
    )
    response.raise_for_status()
    return response.json()['id']
  1. Get the PDF
def get_pdf(
    drive_id_endpoint: str,
    drive_id: str,
    item_id: str,
    access_token: str,
) -> bytes:
    conversion_endpoint = f"{drive_id_endpoint}/{drive_id}/items/{item_id}/content?format=pdf"

    redirect_response = httpx.get(
        url=conversion_endpoint,
        headers={
            "Authorization": f"Bearer {access_token}",
        },
    )
    response.raise_for_status()
    if redirect_response.status_code == 302:
        download_url = redirect_response.headers["Location"]
        response = httpx.get(
            url=download_url,
            headers={
                "Authorization": f"Bearer {access_token}",
            },
        )
        response.raise_for_status()
        return response.content
    else:
        raise ValueError(
            f"Error occurred while fetching the pdf for the file {item_id}, received code {redirect_response.status_code} and response {redirect_response.text}"
        )

Putting the above together, you get something like this:

import os

file_url = "..."
access_token = get_access_token(os.environ["TENANT_ID"], os.environ["CLIENT_ID"], os.environ["CLIENT_SECRET"])
drive_id_endpoint, drive_id = get_drive_id_endpoint_and_drive_id(file_url, access_token)
item_id = get_item_id(drive_id_endpoint, drive_id, file_url, access_token)
pdf_bytes = get_pdf(drive_id_endpoint, drive_id, item_id, access_token)