
Commit e652450

docs: add lancedb docs

Committed Jun 16, 2024 · 1 parent 8f0b5e3 · commit e652450

23 files changed: +341 -26 lines
 

‎docs/astro.config.mjs

Lines changed: 16 additions & 10 deletions

```diff
@@ -155,28 +155,34 @@ export default defineConfig({
           link: '/generative-ai/ollama-processor'
         }]
       }, {
-        label: 'Connectors',
+        label: 'Vector Stores',
         items: [{
           label: 'OpenSearch',
-          link: '/connectors/opensearch-storage-connector'
-        }, {
-          label: 'OpenSearch Vectors',
-          link: '/connectors/opensearch-vector-storage-connector'
+          link: '/vector-stores/opensearch-vector-storage-connector'
         }, {
           label: 'Pinecone',
-          link: '/connectors/pinecone-storage-connector'
+          link: '/vector-stores/pinecone-storage-connector'
+        }, {
+          label: 'LanceDB',
+          link: '/vector-stores/lancedb-storage-connector'
+        }]
+      }, {
+        label: 'Data Stores',
+        items: [{
+          label: 'OpenSearch',
+          link: '/data-stores/opensearch-storage-connector'
         }, {
           label: 'S3',
-          link: '/connectors/s3-storage-connector'
+          link: '/data-stores/s3-storage-connector'
         }, {
           label: 'SQS',
-          link: '/connectors/sqs-storage-connector'
+          link: '/data-stores/sqs-storage-connector'
         }, {
           label: 'Firehose',
-          link: '/connectors/firehose-storage-connector'
+          link: '/data-stores/firehose-storage-connector'
         }, {
           label: 'Neo4j',
-          link: '/connectors/neo4j-storage-connector'
+          link: '/data-stores/neo4j-storage-connector'
         }]
       }, {
         label: 'Text Processing',
```

‎docs/src/assets/icon-lancedb.png

24.1 KB

‎docs/src/content/docs/connectors/firehose-storage-connector.md renamed to ‎docs/src/content/docs/data-stores/firehose-storage-connector.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -24,6 +24,8 @@ The Firehose storage connector makes it possible to forward [CloudEvents](/proje
 
 > 💁 This connector only forwards the CloudEvents emitted by middlewares to the delivery stream, and not the documents themselves.
 
+<br />
+
 ---
 
 ### ⏳ Buffering Documents
```

‎docs/src/content/docs/connectors/neo4j-storage-connector.mdx renamed to ‎docs/src/content/docs/data-stores/neo4j-storage-connector.mdx

Lines changed: 2 additions & 0 deletions

```diff
@@ -36,6 +36,8 @@ This connector is a very good choice for use-cases involving the storage of docu
 
 > 💁 The [Ontology](/project-lakechain/guides/ontology) documentation provides information about the standard structure of nodes and edges defined by Project Lakechain.
 
+<br />
+
 ---
 
 ### 🗄️ Indexing Documents
```

‎docs/src/content/docs/connectors/sqs-storage-connector.md renamed to ‎docs/src/content/docs/data-stores/sqs-storage-connector.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -24,6 +24,8 @@ The SQS storage connector makes it possible to capture the result of one or mult
 
 > 💁 This connector only forwards the [CloudEvents](/project-lakechain/general/events) emitted by middlewares to the SQS queue, and not the documents themselves.
 
+<br />
+
 ---
 
 ### 🕒 Enqueue Documents
```

‎docs/src/content/docs/generative-ai/ollama-processor.mdx

Lines changed: 2 additions & 0 deletions

```diff
@@ -34,6 +34,8 @@ Using this middleware, customers can transform their text documents, as well as
 
 > 💁 You can view the list of models supported by Ollama [here](https://ollama.com/library).
 
+<br />
+
 ---
 
 ### 🦙 Running Ollama
```

‎docs/src/content/docs/image-processing/sharp-image-transform.mdx

Lines changed: 2 additions & 0 deletions

```diff
@@ -36,6 +36,8 @@ The Sharp middleware can be used to apply transformations to images at scale, su
 
 <br />
 
+---
+
 ### 🖼️ Transforming Images
 
 To use this middleware, you import it in your CDK stack and declare the transforms you want to apply on images. Developers can use the same API as the native [Sharp API](https://sharp.pixelplumbing.com/) to declare the transforms to apply on images.
```

docs/src/content/docs/vector-stores/lancedb-storage-connector.mdx (new file; path inferred from the sidebar link and asset import above)

Lines changed: 218 additions & 0 deletions

````mdx
---
title: LanceDB
---

import { Image } from 'astro:assets';
import icon from '../../../assets/icon-lancedb.png';

<span title="Label: Pro" data-view-component="true" class="Label Label--api text-uppercase">
  Unstable API
</span>
<span title="Label: Pro" data-view-component="true" class="Label Label--version text-uppercase">
  0.7.0
</span>
<span title="Label: Pro" data-view-component="true" class="Label Label--package">
  <a target="_blank" href="https://www.npmjs.com/package/@project-lakechain/lancedb-storage-connector">
    @project-lakechain/lancedb-storage-connector
  </a>
</span>
<span class="language-icon">
  <svg role="img" viewBox="0 0 24 24" width="30" xmlns="http://www.w3.org/2000/svg" style="fill: #3178C6;"><title>TypeScript</title><path d="M1.125 0C.502 0 0 .502 0 1.125v21.75C0 23.498.502 24 1.125 24h21.75c.623 0 1.125-.502 1.125-1.125V1.125C24 .502 23.498 0 22.875 0zm17.363 9.75c.612 0 1.154.037 1.627.111a6.38 6.38 0 0 1 1.306.34v2.458a3.95 3.95 0 0 0-.643-.361 5.093 5.093 0 0 0-.717-.26 5.453 5.453 0 0 0-1.426-.2c-.3 0-.573.028-.819.086a2.1 2.1 0 0 0-.623.242c-.17.104-.3.229-.393.374a.888.888 0 0 0-.14.49c0 .196.053.373.156.529.104.156.252.304.443.444s.423.276.696.41c.273.135.582.274.926.416.47.197.892.407 1.266.628.374.222.695.473.963.753.268.279.472.598.614.957.142.359.214.776.214 1.253 0 .657-.125 1.21-.373 1.656a3.033 3.033 0 0 1-1.012 1.085 4.38 4.38 0 0 1-1.487.596c-.566.12-1.163.18-1.79.18a9.916 9.916 0 0 1-1.84-.164 5.544 5.544 0 0 1-1.512-.493v-2.63a5.033 5.033 0 0 0 3.237 1.2c.333 0 .624-.03.872-.09.249-.06.456-.144.623-.25.166-.108.29-.234.373-.38a1.023 1.023 0 0 0-.074-1.089 2.12 2.12 0 0 0-.537-.5 5.597 5.597 0 0 0-.807-.444 27.72 27.72 0 0 0-1.007-.436c-.918-.383-1.602-.852-2.053-1.405-.45-.553-.676-1.222-.676-2.005 0-.614.123-1.141.369-1.582.246-.441.58-.804 1.004-1.089a4.494 4.494 0 0 1 1.47-.629 7.536 7.536 0 0 1 1.77-.201zm-15.113.188h9.563v2.166H9.506v9.646H6.789v-9.646H3.375z"/></svg>
</span>
<span class="language-icon" style="margin-right: 10px">
  <a target="_blank" href="https://lancedb.github.io/lancedb/">
    <Image width="28" src={icon} alt="Icon" style="border-radius: 40%" />
  </a>
</span>
<div style="margin-top: 26px"></div>

---

The LanceDB connector makes it possible for developers to leverage the embedded nature of [LanceDB](https://lancedb.github.io/lancedb/) databases to store document descriptions and their associated vector embeddings.
It can be a particularly good choice for applications that don't require ultra-low latency for indexing and retrieval, and that are not I/O sensitive.

> 💁 By leveraging LanceDB as a vector store, developers can store tens of thousands of vectors at a very low cost, benefiting from the serverless nature of LanceDB.

<br />

---

### 💾 Indexing Documents

To use the LanceDB storage connector, you import it in your CDK stack and connect it to a data source providing document embeddings.
You also define a storage provider, such as S3 or EFS, that serves as the storage backend for the LanceDB database.

> ℹ️ The example below showcases how to create a LanceDB connector leveraging the S3 storage provider.

```typescript
import * as cdk from 'aws-cdk-lib';
import * as s3 from 'aws-cdk-lib/aws-s3';
import { LanceDbStorageConnector, S3Storage } from '@project-lakechain/lancedb-storage-connector';
import { CacheStorage } from '@project-lakechain/core';

class Stack extends cdk.Stack {
  constructor(scope: cdk.Construct, id: string) {
    const cache = new CacheStorage(this, 'Cache');

    // The bucket used to store the LanceDB database.
    const bucket = new s3.Bucket(this, 'Bucket', {
      encryption: s3.BucketEncryption.S3_MANAGED,
      blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL
    });

    // Create the LanceDB storage connector, where `source`
    // is the upstream middleware providing the embeddings.
    const connector = new LanceDbStorageConnector.Builder()
      .withScope(this)
      .withIdentifier('LanceDbStorageConnector')
      .withCacheStorage(cache)
      .withSource(source)
      .withVectorSize(1024)
      .withStorageProvider(new S3Storage.Builder()
        .withScope(this)
        .withIdentifier('S3Storage')
        .withBucket(bucket)
        .build()
      )
      .build();
  }
}
```

<br />

---

### 🗃️ Storage Providers

The LanceDB storage connector supports two different storage providers, allowing you to balance cost, performance, durability, and latency.

#### S3 Storage

The S3 storage provider uses an S3 bucket to store the LanceDB database using a standard storage class.

> 💁 The provider does not create the S3 bucket; it uses a customer-provided bucket, along with an optional path prefix under which the database is stored.

<br />

```typescript
const connector = new LanceDbStorageConnector.Builder()
  .withScope(this)
  .withIdentifier('LanceDbStorageConnector')
  .withCacheStorage(cache)
  .withSource(source)
  .withVectorSize(1024)
  .withStorageProvider(new S3Storage.Builder()
    .withScope(this)
    .withIdentifier('S3Storage')
    .withBucket(bucket) // 👈 Specify the S3 bucket
    .build()
  )
  .build();
```

<br />

---

#### EFS Storage

The EFS storage provider leverages AWS EFS to store the LanceDB database, providing lower latency and higher IOPS than S3.

> 💁 The provider does not create the EFS file system; it uses a customer-provided file system placed in a VPC, along with an optional path prefix under which the database is stored.

<br />

```typescript
const connector = new LanceDbStorageConnector.Builder()
  .withScope(this)
  .withIdentifier('LanceDbStorageConnector')
  .withCacheStorage(cache)
  .withSource(source)
  .withVectorSize(1024)
  .withStorageProvider(new EfsStorage.Builder()
    .withScope(this)
    .withIdentifier('EfsStorage')
    .withFileSystem(fileSystem) // 👈 Specify the EFS
    .withVpc(vpc) // 👈 Specify the EFS VPC
    .build()
  )
  .build();
```

<br />

---

#### Include Text

When the document being processed is a text document, you can choose to include its text alongside the embeddings in the LanceDB table.
This allows you to retrieve the text associated with the embeddings when executing a similarity search, without having to fetch the original text from a separate database.

To do so, you can use the `withIncludeText` API. If the document is not a text document, this option is ignored.

> 💁 By default, the text is not included in the index.

```typescript
const connector = new LanceDbStorageConnector.Builder()
  .withScope(this)
  .withIdentifier('LanceDbStorageConnector')
  .withCacheStorage(cache)
  .withSource(source)
  .withVectorSize(1024)
  .withStorageProvider(storageProvider)
  .withIncludeText(true) // 👈 Include text
  .build();
```

<br />

---

### 🏗️ Architecture

The architecture implemented by the LanceDB storage connector is based on an ARM64 AWS Lambda compute that indexes document embeddings, provided by source middlewares, into the LanceDB database.
The connector uses an AWS Lambda layer to make the LanceDB library available within the Lambda environment.

> 💁 The architecture depends on the selected storage provider. Below is a description of the architecture for each storage provider.

#### S3 Storage Provider

The S3 storage provider uses a user-provided S3 bucket to store the LanceDB database.

![LanceDB Storage Connector S3 Architecture](../../../assets/lancedb-storage-connector-s3-architecture.png)

#### EFS Storage Provider

The EFS storage provider uses a user-provided EFS file system to store the LanceDB database.

![LanceDB Storage Connector EFS Architecture](../../../assets/lancedb-storage-connector-efs-architecture.png)

<br />

---

### 🏷️ Properties

<br />

##### Supported Inputs

| Mime Type | Description |
| --------- | ----------- |
| `*/*`     | This middleware supports any type of document. Note that if no embeddings are specified in the document metadata, the document is filtered out. |

##### Supported Outputs

*This middleware does not produce any output.*

##### Supported Compute Types

| Type  | Description |
| ----- | ----------- |
| `CPU` | This middleware only supports CPU compute. |

<br />

---

### 📖 Examples

- [Bedrock + LanceDB](https://github.com/awslabs/project-lakechain/tree/main/examples/simple-pipelines/embedding-pipelines/bedrock-lancedb-pipeline) - An example showcasing an embedding pipeline using Amazon Bedrock and LanceDB.
````
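
For context, here is a minimal sketch of how an application could query the resulting database. This snippet is not part of the commit: the S3 URI and the `embeddings` table name are assumptions, and it uses the `vectordb` Node.js client for LanceDB available around the time of this commit.

```typescript
import { connect } from 'vectordb';

async function queryNearest() {
  // Connect to the LanceDB database written by the connector
  // (hypothetical bucket and path prefix).
  const db = await connect('s3://my-bucket/lancedb');

  // Open the table holding the embeddings (assumed table name).
  const table = await db.openTable('embeddings');

  // The query vector should come from the same embedding model used by
  // the pipeline, e.g. Amazon Titan with 1024 dimensions in this doc.
  const queryVector: number[] = new Array(1024).fill(0);

  // Return the 10 nearest documents by vector similarity.
  return await table.search(queryVector).limit(10).execute();
}
```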

‎docs/src/content/docs/connectors/pinecone-storage-connector.mdx renamed to ‎docs/src/content/docs/vector-stores/pinecone-storage-connector.mdx

Lines changed: 1 addition & 1 deletion

```diff
@@ -94,7 +94,7 @@ const connector = new PineconeStorageConnector.Builder()
 
 #### Include Text
 
-When the document being processed is a text, you can choose to include the text of the document associated with the embeddings in the Pinecone index. To do so, you can use the `withIncludeText` API. If the document is not a text, this option is ignored.
+When the document being processed is a text document, you can choose to include the text of the document associated with the embeddings in the Pinecone index. To do so, you can use the `withIncludeText` API. If the document is not a text, this option is ignored.
 
 > 💁 By default, the text is not included in the index.
 
```

examples/simple-pipelines/embedding-pipelines/README.md (new file; path inferred from the pipeline directories below)

Lines changed: 17 additions & 0 deletions

```md
# 🤖 Embedding Pipelines

In this directory we provide several examples of embedding pipelines built with Project Lakechain, showcasing how to create vector embeddings for documents using different embedding models and vector stores on AWS.

## 🌟 Examples

Below is a list of the different examples available in this directory.

Pipeline | Description | Model | Vector Store
--- | --- | --- | ---
[Bedrock + LanceDB](bedrock-lancedb-pipeline) | Generate embeddings for text documents using the [Amazon Titan](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) text embedding model and store them in a [LanceDB](https://lancedb.github.io/lancedb/) database. | Amazon Titan Embeddings | LanceDB
[Bedrock + OpenSearch](bedrock-opensearch-pipeline) | Generate embeddings for text documents using the [Amazon Titan](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) text embedding model and store them in an [OpenSearch](https://opensearch.org/) index. | Amazon Titan Embeddings | Amazon OpenSearch
[Bedrock + Pinecone](bedrock-pinecone-pipeline) | Generate embeddings for text documents using the [Amazon Titan](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) text embedding model and store them in a [Pinecone](https://www.pinecone.io/) vector store. | Amazon Titan Embeddings | Pinecone
[CLIP + S3](clip-embeddings-pipeline) | Generate embeddings for images using the [OpenAI CLIP](https://openai.com/research/clip/) embedding model and store them in an S3 bucket. | OpenAI CLIP | None
[Cohere Embeddings + OpenSearch](cohere-opensearch-pipeline) | Generate embeddings for text documents using the [Cohere](https://cohere.ai/) embedding model and store them in an [OpenSearch](https://opensearch.org/) index. | Cohere on Bedrock | Amazon OpenSearch
[PANNS + OpenSearch](panns-opensearch-pipeline) | Generate embeddings for audio using the [PANNS Inference Model](https://github.com/qiuqiangkong/panns_inference) and store them in an [OpenSearch](https://opensearch.org/) index. | PANNS | Amazon OpenSearch
[Sentence Transformers + OpenSearch](sentence-transformers-pipeline) | Generate embeddings for text documents using the [Sentence Transformers](https://www.sbert.net/) embedding model and store them in an [OpenSearch](https://opensearch.org/) index. | Sentence Transformers | Amazon OpenSearch
```

‎examples/simple-pipelines/embedding-pipelines/bedrock-lancedb-pipeline/README.md

Lines changed: 6 additions & 2 deletions

````diff
@@ -1,18 +1,22 @@
 # 💾 Bedrock + LanceDB Pipeline
 
-> In this example, we showcase how to create vector embeddings for text documents using the [Amazon Bedrock](https://aws.amazon.com/bedrock/) Titan embedding model. The embeddings are stored within a [LanceDB](https://lancedb.github.io/lancedb/) embedded database that you can query using your own applications.
+> In this example, we showcase how to create vector embeddings for text documents (Plain Text, PDF, Office Documents) using the [Amazon Bedrock](https://aws.amazon.com/bedrock/) Titan embedding model. The embeddings are stored within a [LanceDB](https://lancedb.github.io/lancedb/) embedded database that you can query using your own applications.
 
 ## :dna: Pipeline
 
 ```mermaid
 flowchart LR
   Input([Input Bucket]) -.-> S3[S3 Trigger]
   S3 --> TextSplitter[Text Splitter]
+  S3 --> PDF[PDF Text Converter]
+  S3 --> Pandoc[Pandoc Text Converter]
+  PDF --> TextSplitter
+  Pandoc --> TextSplitter
   TextSplitter --> Bedrock[Bedrock Embedding Processor]
   Bedrock --> LanceDB[LanceDB Storage]
 ```
 
-In this pipeline we are generating embeddings for text documents and leveraging the EFS storage provider with the LanceDB connector to store embeddings. The use of EFS makes it a good balance between cost and latency provided for storage and retrieval of documents based on their vector embeddings.
+In this pipeline we generate embeddings for text documents, including plain text files, PDFs, and Microsoft Office documents, and leverage the EFS storage provider with the LanceDB connector to store the generated embeddings in an embedded LanceDB database. EFS strikes a good balance between cost and latency for storing and retrieving documents based on their vector embeddings.
 
 > 💁 It is also possible to use the S3 storage provider with the LanceDB connector to store embeddings on S3 for an even lower cost, but at a much higher latency.
 
````
‎examples/simple-pipelines/embedding-pipelines/bedrock-lancedb-pipeline/package.json

Lines changed: 2 additions & 0 deletions

```diff
@@ -30,6 +30,8 @@
     "ts-node": "^10.9.2"
   },
   "dependencies": {
+    "@project-lakechain/pandoc-text-converter": "*",
+    "@project-lakechain/pdf-text-converter": "*",
     "@project-lakechain/bedrock-embedding-processors": "*",
     "@project-lakechain/lancedb-storage-connector": "*",
     "@project-lakechain/recursive-character-text-splitter": "*",
```

‎examples/simple-pipelines/embedding-pipelines/bedrock-lancedb-pipeline/stack.ts

Lines changed: 31 additions & 5 deletions

```diff
@@ -26,6 +26,8 @@ import { CacheStorage } from '@project-lakechain/core';
 import { S3EventTrigger } from '@project-lakechain/s3-event-trigger';
 import { RecursiveCharacterTextSplitter } from '@project-lakechain/recursive-character-text-splitter';
 import { TitanEmbeddingProcessor } from '@project-lakechain/bedrock-embedding-processors';
+import { PdfTextConverter } from '@project-lakechain/pdf-text-converter';
+import { PandocTextConverter } from '@project-lakechain/pandoc-text-converter';
 import { LanceDbStorageConnector, EfsStorage } from '@project-lakechain/lancedb-storage-connector';
 
 /**
@@ -34,9 +36,13 @@ import { LanceDbStorageConnector, EfsStorage } from '@project-lakechain/lancedb-
  * The pipeline looks as follows:
  *
  *
- * ┌──────┐   ┌───────────────┐   ┌────────────────────┐   ┌───────────┐
- * │  S3  ├──►│ Text Splitter ├──►│ Bedrock Embeddings │──►|  LanceDB  │
- * └──────┘   └───────────────┘   └────────────────────┘   └───────────┘
+ *                ┌──────────────────────┐
+ *   ┌───────────►│  PDF Text Converter  ├──────────┐
+ *   │            └──────────────────────┘          |
+ *   |                                              ▼
+ * ┌──────────────┐   ┌────────────────────┐   ┌───────────────┐   ┌───────────┐   ┌───────────┐
+ * │   S3 Input   ├──►│  Pandoc Converter  ├──►│ Text Splitter ├──►│  Bedrock  ├──►|  LanceDB  │
+ * └──────────────┘   └────────────────────┘   └───────────────┘   └───────────┘   └───────────┘
  *
  */
 export class BedrockLanceDbPipeline extends cdk.Stack {
@@ -76,7 +82,7 @@ export class BedrockLanceDbPipeline extends cdk.Stack {
         subnetType: ec2.SubnetType.PRIVATE_ISOLATED
       }
     });
-
+
     // The cache storage.
     const cache = new CacheStorage(this, 'CacheStorage', {});
 
@@ -92,14 +98,34 @@ export class BedrockLanceDbPipeline extends cdk.Stack {
       .withBucket(source)
       .build();
 
+    // Convert PDF documents to text.
+    const pdfConverter = new PdfTextConverter.Builder()
+      .withScope(this)
+      .withIdentifier('PdfConverter')
+      .withCacheStorage(cache)
+      .withSource(trigger)
+      .build();
+
+    // Convert text-oriented documents (Docx, Markdown, HTML, etc) to text.
+    const pandocConverter = new PandocTextConverter.Builder()
+      .withScope(this)
+      .withIdentifier('PandocConverter')
+      .withCacheStorage(cache)
+      .withSource(trigger)
+      .build();
+
     // We use the `RecursiveCharacterTextSplitter` to split
     // input text into smaller chunks. This is required to ensure
     // that the generated embeddings are relevant.
     const textSplitter = new RecursiveCharacterTextSplitter.Builder()
       .withScope(this)
       .withIdentifier('RecursiveCharacterTextSplitter')
       .withCacheStorage(cache)
-      .withSource(trigger)
+      .withSources([
+        pdfConverter,
+        pandocConverter,
+        trigger
+      ])
       .withChunkSize(4096)
       .build();
```
‎examples/simple-pipelines/embedding-pipelines/bedrock-opensearch-pipeline/README.md

Lines changed: 5 additions & 1 deletion

````diff
@@ -1,13 +1,17 @@
 # 🤖 Bedrock OpenSearch Pipeline
 
-> In this example, we showcase how to create vector embeddings for text documents using the [Amazon Bedrock](https://aws.amazon.com/bedrock/) Titan embedding model. The embeddings are stored within an Amazon OpenSearch index automatically that you can query using your own applications.
+> In this example, we showcase how to create vector embeddings for text documents (Plain Text, PDF, Office Documents) using the [Amazon Bedrock](https://aws.amazon.com/bedrock/) Titan embedding model. The embeddings are automatically stored within an Amazon OpenSearch index that you can query using your own applications.
 
 ## :dna: Pipeline
 
 ```mermaid
 flowchart LR
   Input([Input Bucket]) -.-> S3[S3 Trigger]
   S3 --> TextSplitter[Text Splitter]
+  S3 --> PDF[PDF Text Converter]
+  S3 --> Pandoc[Pandoc Text Converter]
+  PDF --> TextSplitter
+  Pandoc --> TextSplitter
   TextSplitter --> Bedrock[Bedrock Embedding Processor]
   Bedrock --> OpenSearch[OpenSearch Vector Storage]
 ```
````

‎examples/simple-pipelines/embedding-pipelines/bedrock-opensearch-pipeline/package.json

Lines changed: 2 additions & 0 deletions

```diff
@@ -30,6 +30,8 @@
     "ts-node": "^10.9.2"
   },
   "dependencies": {
+    "@project-lakechain/pandoc-text-converter": "*",
+    "@project-lakechain/pdf-text-converter": "*",
     "@project-lakechain/bedrock-embedding-processors": "*",
     "@project-lakechain/opensearch-domain": "*",
     "@project-lakechain/opensearch-vector-storage-connector": "*",
```

‎examples/simple-pipelines/embedding-pipelines/bedrock-opensearch-pipeline/stack.ts

Lines changed: 30 additions & 5 deletions

```diff
@@ -26,17 +26,22 @@ import { S3EventTrigger } from '@project-lakechain/s3-event-trigger';
 import { RecursiveCharacterTextSplitter } from '@project-lakechain/recursive-character-text-splitter';
 import { OpenSearchVectorStorageConnector, OpenSearchVectorIndexDefinition } from '@project-lakechain/opensearch-vector-storage-connector';
 import { TitanEmbeddingProcessor, TitanEmbeddingModel } from '@project-lakechain/bedrock-embedding-processors';
+import { PdfTextConverter } from '@project-lakechain/pdf-text-converter';
+import { PandocTextConverter } from '@project-lakechain/pandoc-text-converter';
 import { OpenSearchDomain } from '@project-lakechain/opensearch-domain';
 
 /**
  * An example stack showcasing how to use Amazon Bedrock embeddings
  * and OpenSearch for storing embeddings.
  * The pipeline looks as follows:
  *
- *
- * ┌──────┐   ┌───────────────┐   ┌────────────────────┐   ┌──────────────┐
- * │  S3  ├──►│ Text Splitter ├──►│ Bedrock Embeddings │──►|  OpenSearch  │
- * └──────┘   └───────────────┘   └────────────────────┘   └──────────────┘
+ *                ┌──────────────────────┐
+ *   ┌───────────►│  PDF Text Converter  ├──────────┐
+ *   │            └──────────────────────┘          |
+ *   |                                              ▼
+ * ┌──────────────┐   ┌────────────────────┐   ┌───────────────┐   ┌───────────┐   ┌──────────────┐
+ * │   S3 Input   ├──►│  Pandoc Converter  ├──►│ Text Splitter ├──►│  Bedrock  ├──►|  OpenSearch  │
+ * └──────────────┘   └────────────────────┘   └───────────────┘   └───────────┘   └──────────────┘
  *
  */
 export class BedrockEmbeddingPipeline extends cdk.Stack {
@@ -86,14 +91,34 @@ export class BedrockEmbeddingPipeline extends cdk.Stack {
       .withBucket(source)
       .build();
 
+    // Convert PDF documents to text.
+    const pdfConverter = new PdfTextConverter.Builder()
+      .withScope(this)
+      .withIdentifier('PdfConverter')
+      .withCacheStorage(cache)
+      .withSource(trigger)
+      .build();
+
+    // Convert text-oriented documents (Docx, Markdown, HTML, etc) to text.
+    const pandocConverter = new PandocTextConverter.Builder()
+      .withScope(this)
+      .withIdentifier('PandocConverter')
+      .withCacheStorage(cache)
+      .withSource(trigger)
+      .build();
+
     // We use the `RecursiveCharacterTextSplitter` to split
     // input text into smaller chunks. This is required to ensure
     // that the generated embeddings are relevant.
     const textSplitter = new RecursiveCharacterTextSplitter.Builder()
       .withScope(this)
       .withIdentifier('RecursiveCharacterTextSplitter')
       .withCacheStorage(cache)
-      .withSource(trigger)
+      .withSources([
+        pdfConverter,
+        pandocConverter,
+        trigger
+      ])
       .withChunkSize(4096)
       .build();
```

‎packages/middlewares/storage-connectors/lancedb-storage-connector/src/definitions/opts.ts

Lines changed: 2 additions & 2 deletions

```diff
@@ -47,11 +47,11 @@ export const LanceDbStorageConnectorPropsSchema = MiddlewarePropsSchema.extend({
   /**
    * Whether to include the text associated with the
    * embeddings in LanceDB.
-   * @default true
+   * @default false
    */
   includeText: z
     .boolean()
-    .default(true)
+    .default(false)
 });
 
 // The type of the LanceDbStorageConnectorProps schema.
```
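
To make the new default concrete, here is a standalone Zod sketch reproducing the changed field in isolation (an illustration, not the connector's actual schema):

```typescript
import { z } from 'zod';

// The `includeText` field as changed by this commit: it now
// resolves to `false` when the caller omits it.
const PropsSketch = z.object({
  includeText: z.boolean().default(false)
});

console.log(PropsSketch.parse({}).includeText);                     // false
console.log(PropsSketch.parse({ includeText: true }).includeText);  // true (opt-in via `withIncludeText(true)`)
```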

‎packages/middlewares/storage-connectors/lancedb-storage-connector/src/index.ts

Lines changed: 1 addition & 0 deletions

```diff
@@ -105,6 +105,7 @@ class LanceDbStorageConnectorBuilder extends MiddlewareBuilder {
    * with the embeddings in LanceDB.
    * @param includeText whether to include the text
    * associated with the embeddings in LanceDB.
+   * @default false
    * @returns the builder instance.
    */
   public withIncludeText(includeText: boolean) {
```
