
Commit e652450

docs: add lancedb docs

Committed Jun 16, 2024 · 1 parent 8f0b5e3 · commit e652450

23 files changed: +341 -26 lines
 

‎docs/astro.config.mjs

Lines changed: 16 additions & 10 deletions

```diff
@@ -155,28 +155,34 @@ export default defineConfig({
           link: '/generative-ai/ollama-processor'
         }]
       }, {
-        label: 'Connectors',
+        label: 'Vector Stores',
         items: [{
           label: 'OpenSearch',
-          link: '/connectors/opensearch-storage-connector'
-        }, {
-          label: 'OpenSearch Vectors',
-          link: '/connectors/opensearch-vector-storage-connector'
+          link: '/vector-stores/opensearch-vector-storage-connector'
         }, {
           label: 'Pinecone',
-          link: '/connectors/pinecone-storage-connector'
+          link: '/vector-stores/pinecone-storage-connector'
+        }, {
+          label: 'LanceDB',
+          link: '/vector-stores/lancedb-storage-connector'
+        }]
+      }, {
+        label: 'Data Stores',
+        items: [{
+          label: 'OpenSearch',
+          link: '/data-stores/opensearch-storage-connector'
         }, {
           label: 'S3',
-          link: '/connectors/s3-storage-connector'
+          link: '/data-stores/s3-storage-connector'
         }, {
           label: 'SQS',
-          link: '/connectors/sqs-storage-connector'
+          link: '/data-stores/sqs-storage-connector'
         }, {
           label: 'Firehose',
-          link: '/connectors/firehose-storage-connector'
+          link: '/data-stores/firehose-storage-connector'
         }, {
           label: 'Neo4j',
-          link: '/connectors/neo4j-storage-connector'
+          link: '/data-stores/neo4j-storage-connector'
         }]
       }, {
         label: 'Text Processing',
```

‎docs/src/assets/icon-lancedb.png

24.1 KB

‎docs/src/content/docs/connectors/firehose-storage-connector.md renamed to ‎docs/src/content/docs/data-stores/firehose-storage-connector.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -24,6 +24,8 @@ The Firehose storage connector makes it possible to forward [CloudEvents](/proje
 
 > 💁 This connector only forwards the CloudEvents emitted by middlewares to the delivery stream, and not the documents themselves.
 
+<br />
+
 ---
 
 ### ⏳ Buffering Documents
```

‎docs/src/content/docs/connectors/neo4j-storage-connector.mdx renamed to ‎docs/src/content/docs/data-stores/neo4j-storage-connector.mdx

Lines changed: 2 additions & 0 deletions

```diff
@@ -36,6 +36,8 @@ This connector is a very good choice for use-cases involving the storage of docu
 
 > 💁 The [Ontology](/project-lakechain/guides/ontology) documentation provides information about the standard structure of nodes and edges defined by Project Lakechain.
 
+<br />
+
 ---
 
 ### 🗄️ Indexing Documents
```

‎docs/src/content/docs/connectors/sqs-storage-connector.md renamed to ‎docs/src/content/docs/data-stores/sqs-storage-connector.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -24,6 +24,8 @@ The SQS storage connector makes it possible to capture the result of one or mult
 
 > 💁 This connector only forwards the [CloudEvents](/project-lakechain/general/events) emitted by middlewares to the SQS queue, and not the documents themselves.
 
+<br />
+
 ---
 
 ### 🕒 Enqueue Documents
```

‎docs/src/content/docs/generative-ai/ollama-processor.mdx

Lines changed: 2 additions & 0 deletions

```diff
@@ -34,6 +34,8 @@ Using this middleware, customers can transform their text documents, as well as
 
 > 💁 You can view the list of models supported by Ollama [here](https://ollama.com/library).
 
+<br />
+
 ---
 
 ### 🦙 Running Ollama
```

‎docs/src/content/docs/image-processing/sharp-image-transform.mdx

Lines changed: 2 additions & 0 deletions

```diff
@@ -36,6 +36,8 @@ The Sharp middleware can be used to apply transformations to images at scale, su
 
 <br />
 
+---
+
 ### 🖼️ Transforming Images
 
 To use this middleware, you import it in your CDK stack and declare the transforms you want to apply on images. Developers can use the same API as the native [Sharp API](https://sharp.pixelplumbing.com/) to declare the transforms to apply on images.
```

docs/src/content/docs/vector-stores/lancedb-storage-connector.mdx (new file; path inferred from the sidebar link and asset import above)

Lines changed: 218 additions & 0 deletions

````mdx
---
title: LanceDB
---

import { Image } from 'astro:assets';
import icon from '../../../assets/icon-lancedb.png';

<span title="Label: Pro" data-view-component="true" class="Label Label--api text-uppercase">
  Unstable API
</span>
<span title="Label: Pro" data-view-component="true" class="Label Label--version text-uppercase">
  0.7.0
</span>
<span title="Label: Pro" data-view-component="true" class="Label Label--package">
  <a target="_blank" href="https://www.npmjs.com/package/@project-lakechain/lancedb-storage-connector">
    @project-lakechain/lancedb-storage-connector
  </a>
</span>
<span class="language-icon">
  <svg role="img" viewBox="0 0 24 24" width="30" xmlns="http://www.w3.org/2000/svg" style="fill: #3178C6;"><title>TypeScript</title><path d="M1.125 0C.502 0 0 .502 0 1.125v21.75C0 23.498.502 24 1.125 24h21.75c.623 0 1.125-.502 1.125-1.125V1.125C24 .502 23.498 0 22.875 0zm17.363 9.75c.612 0 1.154.037 1.627.111a6.38 6.38 0 0 1 1.306.34v2.458a3.95 3.95 0 0 0-.643-.361 5.093 5.093 0 0 0-.717-.26 5.453 5.453 0 0 0-1.426-.2c-.3 0-.573.028-.819.086a2.1 2.1 0 0 0-.623.242c-.17.104-.3.229-.393.374a.888.888 0 0 0-.14.49c0 .196.053.373.156.529.104.156.252.304.443.444s.423.276.696.41c.273.135.582.274.926.416.47.197.892.407 1.266.628.374.222.695.473.963.753.268.279.472.598.614.957.142.359.214.776.214 1.253 0 .657-.125 1.21-.373 1.656a3.033 3.033 0 0 1-1.012 1.085 4.38 4.38 0 0 1-1.487.596c-.566.12-1.163.18-1.79.18a9.916 9.916 0 0 1-1.84-.164 5.544 5.544 0 0 1-1.512-.493v-2.63a5.033 5.033 0 0 0 3.237 1.2c.333 0 .624-.03.872-.09.249-.06.456-.144.623-.25.166-.108.29-.234.373-.38a1.023 1.023 0 0 0-.074-1.089 2.12 2.12 0 0 0-.537-.5 5.597 5.597 0 0 0-.807-.444 27.72 27.72 0 0 0-1.007-.436c-.918-.383-1.602-.852-2.053-1.405-.45-.553-.676-1.222-.676-2.005 0-.614.123-1.141.369-1.582.246-.441.58-.804 1.004-1.089a4.494 4.494 0 0 1 1.47-.629 7.536 7.536 0 0 1 1.77-.201zm-15.113.188h9.563v2.166H9.506v9.646H6.789v-9.646H3.375z"/></svg>
</span>
<span class="language-icon" style="margin-right: 10px">
  <a target="_blank" href="https://lancedb.github.io/lancedb/">
    <Image width="28" src={icon} alt="Icon" style="border-radius: 40%" />
  </a>
</span>
<div style="margin-top: 26px"></div>

---

The LanceDB connector makes it possible for developers to leverage the embedded nature of [LanceDB](https://lancedb.github.io/lancedb/) databases to store document descriptions and their associated vector embeddings.
It can be a particularly good choice for applications that don't require ultra-low latency for indexing and retrieval, and that are not I/O sensitive.

> 💁 By leveraging LanceDB as a vector store, developers can store tens of thousands of vectors at a very low cost, benefiting from the serverless nature of LanceDB.

<br />

---

### 💾 Indexing Documents

To use the LanceDB storage connector, you import it in your CDK stack and connect it to a data source providing document embeddings.
You also define a storage provider, such as S3 or EFS, that serves as the storage backend for the LanceDB database.

> ℹ️ The example below showcases how to create a LanceDB connector leveraging the S3 storage provider.

```typescript
import * as cdk from 'aws-cdk-lib';
import * as s3 from 'aws-cdk-lib/aws-s3';
import { LanceDbStorageConnector, S3Storage } from '@project-lakechain/lancedb-storage-connector';
import { CacheStorage } from '@project-lakechain/core';

class Stack extends cdk.Stack {
  constructor(scope: cdk.Construct, id: string) {
    const cache = new CacheStorage(this, 'Cache');

    // The bucket used to store the LanceDB database.
    const bucket = new s3.Bucket(this, 'Bucket', {
      encryption: s3.BucketEncryption.S3_MANAGED,
      blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL
    });

    // Create the LanceDB storage connector, where `source`
    // is the upstream middleware providing the embeddings.
    const connector = new LanceDbStorageConnector.Builder()
      .withScope(this)
      .withIdentifier('LanceDbStorageConnector')
      .withCacheStorage(cache)
      .withSource(source)
      .withVectorSize(1024)
      .withStorageProvider(new S3Storage.Builder()
        .withScope(this)
        .withIdentifier('S3Storage')
        .withBucket(bucket)
        .build()
      )
      .build();
  }
}
```

<br />

---

### 🗃️ Storage Providers

The LanceDB storage connector supports two different storage providers, allowing you to balance cost, performance, durability, and latency.

#### S3 Storage

The S3 storage provider uses an S3 bucket to store the LanceDB database using a standard storage class.

> 💁 The provider does not create the S3 bucket; it uses a customer-provided bucket, along with an optional path prefix under which the database is stored.

<br />

```typescript
const connector = new LanceDbStorageConnector.Builder()
  .withScope(this)
  .withIdentifier('LanceDbStorageConnector')
  .withCacheStorage(cache)
  .withSource(source)
  .withVectorSize(1024)
  .withStorageProvider(new S3Storage.Builder()
    .withScope(this)
    .withIdentifier('S3Storage')
    .withBucket(bucket) // 👈 Specify the S3 bucket
    .build()
  )
  .build();
```

<br />

---

#### EFS Storage

The EFS storage provider leverages AWS EFS to store the LanceDB database, providing lower latency and higher IOPS than S3.

> 💁 The provider does not create the EFS file system; it uses a customer-provided file system placed in a VPC, along with an optional path prefix under which the database is stored.

<br />

```typescript
const connector = new LanceDbStorageConnector.Builder()
  .withScope(this)
  .withIdentifier('LanceDbStorageConnector')
  .withCacheStorage(cache)
  .withSource(source)
  .withVectorSize(1024)
  .withStorageProvider(new EfsStorage.Builder()
    .withScope(this)
    .withIdentifier('EfsStorage')
    .withFileSystem(fileSystem) // 👈 Specify the EFS
    .withVpc(vpc) // 👈 Specify the EFS VPC
    .build()
  )
  .build();
```

<br />

---

#### Include Text

When the document being processed is a text document, you can choose to include its text alongside the embeddings in the LanceDB table.
This allows you to retrieve the text associated with the embeddings when executing a similarity search, without having to fetch the original text from a separate database.

To do so, you can use the `withIncludeText` API. If the document is not a text document, this option is ignored.

> 💁 By default, the text is not included in the index.

```typescript
const connector = new LanceDbStorageConnector.Builder()
  .withScope(this)
  .withIdentifier('LanceDbStorageConnector')
  .withCacheStorage(cache)
  .withSource(source)
  .withVectorSize(1024)
  .withStorageProvider(storageProvider)
  .withIncludeText(true) // 👈 Include text
  .build();
```

<br />

---

### 🏗️ Architecture

The architecture implemented by the LanceDB storage connector is based on an ARM64 AWS Lambda compute that indexes document embeddings, provided by source middlewares, into the LanceDB database.
The connector uses an AWS Lambda layer to make the LanceDB library available within the Lambda environment.

> 💁 The architecture depends on the selected storage provider. Below is a description of the architecture for each storage provider.

#### S3 Storage Provider

The S3 storage provider uses a user-provided S3 bucket to store the LanceDB database.

![LanceDB Storage Connector S3 Architecture](../../../assets/lancedb-storage-connector-s3-architecture.png)

#### EFS Storage Provider

The EFS storage provider uses a user-provided EFS file system to store the LanceDB database.

![LanceDB Storage Connector EFS Architecture](../../../assets/lancedb-storage-connector-efs-architecture.png)

<br />

---

### 🏷️ Properties

<br />

##### Supported Inputs

| Mime Type | Description |
| --------- | ----------- |
| `*/*`     | This middleware supports any type of document. Note that if no embeddings are specified in the document metadata, the document is filtered out. |

##### Supported Outputs

*This middleware does not produce any output.*

##### Supported Compute Types

| Type  | Description |
| ----- | ----------- |
| `CPU` | This middleware only supports CPU compute. |

<br />

---

### 📖 Examples

- [Bedrock + LanceDB](https://github.com/awslabs/project-lakechain/tree/main/examples/simple-pipelines/embedding-pipelines/bedrock-lancedb-pipeline) - An example showcasing an embedding pipeline using Amazon Bedrock and LanceDB.
````
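
For context, here is a minimal sketch of how an application could query the resulting database. This snippet is not part of the commit: the S3 URI and the `embeddings` table name are assumptions, and it uses the `vectordb` Node.js client for LanceDB available around the time of this commit.

```typescript
import { connect } from 'vectordb';

async function queryNearest() {
  // Connect to the LanceDB database written by the connector
  // (hypothetical bucket and path prefix).
  const db = await connect('s3://my-bucket/lancedb');

  // Open the table holding the embeddings (assumed table name).
  const table = await db.openTable('embeddings');

  // The query vector should come from the same embedding model used by
  // the pipeline, e.g. Amazon Titan with 1024 dimensions in this doc.
  const queryVector: number[] = new Array(1024).fill(0);

  // Return the 10 nearest documents by vector similarity.
  return await table.search(queryVector).limit(10).execute();
}
```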

‎docs/src/content/docs/connectors/pinecone-storage-connector.mdx renamed to ‎docs/src/content/docs/vector-stores/pinecone-storage-connector.mdx

Lines changed: 1 addition & 1 deletion

```diff
@@ -94,7 +94,7 @@ const connector = new PineconeStorageConnector.Builder()
 
 #### Include Text
 
-When the document being processed is a text, you can choose to include the text of the document associated with the embeddings in the Pinecone index. To do so, you can use the `withIncludeText` API. If the document is not a text, this option is ignored.
+When the document being processed is a text document, you can choose to include the text of the document associated with the embeddings in the Pinecone index. To do so, you can use the `withIncludeText` API. If the document is not a text, this option is ignored.
 
 > 💁 By default, the text is not included in the index.
 
```

examples/simple-pipelines/embedding-pipelines/README.md (new file; path inferred from the pipeline directories below)

Lines changed: 17 additions & 0 deletions

```md
# 🤖 Embedding Pipelines

In this directory we provide several examples of embedding pipelines built with Project Lakechain, showcasing how to create vector embeddings for documents using different embedding models and vector stores on AWS.

## 🌟 Examples

Below is a list of the different examples available in this directory.

Pipeline | Description | Model | Vector Store
--- | --- | --- | ---
[Bedrock + LanceDB](bedrock-lancedb-pipeline) | Generate embeddings for text documents using the [Amazon Titan](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) text embedding model and store them in a [LanceDB](https://lancedb.github.io/lancedb/) database. | Amazon Titan Embeddings | LanceDB
[Bedrock + OpenSearch](bedrock-opensearch-pipeline) | Generate embeddings for text documents using the [Amazon Titan](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) text embedding model and store them in an [OpenSearch](https://opensearch.org/) index. | Amazon Titan Embeddings | Amazon OpenSearch
[Bedrock + Pinecone](bedrock-pinecone-pipeline) | Generate embeddings for text documents using the [Amazon Titan](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) text embedding model and store them in a [Pinecone](https://www.pinecone.io/) vector store. | Amazon Titan Embeddings | Pinecone
[CLIP + S3](clip-embeddings-pipeline) | Generate embeddings for images using the [OpenAI CLIP](https://openai.com/research/clip/) embedding model and store them in an S3 bucket. | OpenAI CLIP | None
[Cohere Embeddings + OpenSearch](cohere-opensearch-pipeline) | Generate embeddings for text documents using the [Cohere](https://cohere.ai/) embedding model and store them in an [OpenSearch](https://opensearch.org/) index. | Cohere on Bedrock | Amazon OpenSearch
[PANNS + OpenSearch](panns-opensearch-pipeline) | Generate embeddings for audio using the [PANNS Inference Model](https://github.com/qiuqiangkong/panns_inference) and store them in an [OpenSearch](https://opensearch.org/) index. | PANNS | Amazon OpenSearch
[Sentence Transformers + OpenSearch](sentence-transformers-pipeline) | Generate embeddings for text documents using the [Sentence Transformers](https://www.sbert.net/) embedding model and store them in an [OpenSearch](https://opensearch.org/) index. | Sentence Transformers | Amazon OpenSearch
```

‎examples/simple-pipelines/embedding-pipelines/bedrock-lancedb-pipeline/README.md

Lines changed: 6 additions & 2 deletions

````diff
@@ -1,18 +1,22 @@
 # 💾 Bedrock + LanceDB Pipeline
 
-> In this example, we showcase how to create vector embeddings for text documents using the [Amazon Bedrock](https://aws.amazon.com/bedrock/) Titan embedding model. The embeddings are stored within a [LanceDB](https://lancedb.github.io/lancedb/) embedded database that you can query using your own applications.
+> In this example, we showcase how to create vector embeddings for text documents (Plain Text, PDF, Office Documents) using the [Amazon Bedrock](https://aws.amazon.com/bedrock/) Titan embedding model. The embeddings are stored within a [LanceDB](https://lancedb.github.io/lancedb/) embedded database that you can query using your own applications.
 
 ## :dna: Pipeline
 
 ```mermaid
 flowchart LR
   Input([Input Bucket]) -.-> S3[S3 Trigger]
   S3 --> TextSplitter[Text Splitter]
+  S3 --> PDF[PDF Text Converter]
+  S3 --> Pandoc[Pandoc Text Converter]
+  PDF --> TextSplitter
+  Pandoc --> TextSplitter
   TextSplitter --> Bedrock[Bedrock Embedding Processor]
   Bedrock --> LanceDB[LanceDB Storage]
 ```
 
-In this pipeline we are generating embeddings for text documents and leveraging the EFS storage provider with the LanceDB connector to store embeddings. The use of EFS makes it a good balance between cost and latency provided for storage and retrieval of documents based on their vector embeddings.
+In this pipeline we generate embeddings for text documents, including plain text files, PDFs, and Microsoft Office documents, and leverage the EFS storage provider with the LanceDB connector to store the generated embeddings in an embedded LanceDB database. EFS strikes a good balance between cost and latency for storing and retrieving documents based on their vector embeddings.
 
 > 💁 It is also possible to use the S3 storage provider with the LanceDB connector to store embeddings on S3 for an even lower cost, but at a much higher latency.
 
````
‎examples/simple-pipelines/embedding-pipelines/bedrock-lancedb-pipeline/package.json

Lines changed: 2 additions & 0 deletions

```diff
@@ -30,6 +30,8 @@
     "ts-node": "^10.9.2"
   },
   "dependencies": {
+    "@project-lakechain/pandoc-text-converter": "*",
+    "@project-lakechain/pdf-text-converter": "*",
     "@project-lakechain/bedrock-embedding-processors": "*",
     "@project-lakechain/lancedb-storage-connector": "*",
     "@project-lakechain/recursive-character-text-splitter": "*",
```

‎examples/simple-pipelines/embedding-pipelines/bedrock-lancedb-pipeline/stack.ts

Lines changed: 31 additions & 5 deletions

```diff
@@ -26,6 +26,8 @@ import { CacheStorage } from '@project-lakechain/core';
 import { S3EventTrigger } from '@project-lakechain/s3-event-trigger';
 import { RecursiveCharacterTextSplitter } from '@project-lakechain/recursive-character-text-splitter';
 import { TitanEmbeddingProcessor } from '@project-lakechain/bedrock-embedding-processors';
+import { PdfTextConverter } from '@project-lakechain/pdf-text-converter';
+import { PandocTextConverter } from '@project-lakechain/pandoc-text-converter';
 import { LanceDbStorageConnector, EfsStorage } from '@project-lakechain/lancedb-storage-connector';
 
 /**
@@ -34,9 +36,13 @@ import { LanceDbStorageConnector, EfsStorage } from '@project-lakechain/lancedb-
  * The pipeline looks as follows:
  *
  *
- * ┌──────┐   ┌───────────────┐   ┌────────────────────┐   ┌───────────┐
- * │  S3  ├──►│ Text Splitter ├──►│ Bedrock Embeddings │──►|  LanceDB  │
- * └──────┘   └───────────────┘   └────────────────────┘   └───────────┘
+ *                ┌──────────────────────┐
+ *   ┌───────────►│  PDF Text Converter  ├──────────┐
+ *   │            └──────────────────────┘          |
+ *   |                                              ▼
+ * ┌──────────────┐   ┌────────────────────┐   ┌───────────────┐   ┌───────────┐   ┌───────────┐
+ * │   S3 Input   ├──►│  Pandoc Converter  ├──►│ Text Splitter ├──►│  Bedrock  ├──►|  LanceDB  │
+ * └──────────────┘   └────────────────────┘   └───────────────┘   └───────────┘   └───────────┘
  *
  */
 export class BedrockLanceDbPipeline extends cdk.Stack {
@@ -76,7 +82,7 @@ export class BedrockLanceDbPipeline extends cdk.Stack {
         subnetType: ec2.SubnetType.PRIVATE_ISOLATED
       }
     });
-
+
     // The cache storage.
     const cache = new CacheStorage(this, 'CacheStorage', {});
 
@@ -92,14 +98,34 @@ export class BedrockLanceDbPipeline extends cdk.Stack {
       .withBucket(source)
       .build();
 
+    // Convert PDF documents to text.
+    const pdfConverter = new PdfTextConverter.Builder()
+      .withScope(this)
+      .withIdentifier('PdfConverter')
+      .withCacheStorage(cache)
+      .withSource(trigger)
+      .build();
+
+    // Convert text-oriented documents (Docx, Markdown, HTML, etc) to text.
+    const pandocConverter = new PandocTextConverter.Builder()
+      .withScope(this)
+      .withIdentifier('PandocConverter')
+      .withCacheStorage(cache)
+      .withSource(trigger)
+      .build();
+
     // We use the `RecursiveCharacterTextSplitter` to split
     // input text into smaller chunks. This is required to ensure
     // that the generated embeddings are relevant.
     const textSplitter = new RecursiveCharacterTextSplitter.Builder()
       .withScope(this)
       .withIdentifier('RecursiveCharacterTextSplitter')
       .withCacheStorage(cache)
-      .withSource(trigger)
+      .withSources([
+        pdfConverter,
+        pandocConverter,
+        trigger
+      ])
       .withChunkSize(4096)
       .build();
```
‎examples/simple-pipelines/embedding-pipelines/bedrock-opensearch-pipeline/README.md

Lines changed: 5 additions & 1 deletion

````diff
@@ -1,13 +1,17 @@
 # 🤖 Bedrock OpenSearch Pipeline
 
-> In this example, we showcase how to create vector embeddings for text documents using the [Amazon Bedrock](https://aws.amazon.com/bedrock/) Titan embedding model. The embeddings are stored within an Amazon OpenSearch index automatically that you can query using your own applications.
+> In this example, we showcase how to create vector embeddings for text documents (Plain Text, PDF, Office Documents) using the [Amazon Bedrock](https://aws.amazon.com/bedrock/) Titan embedding model. The embeddings are automatically stored within an Amazon OpenSearch index that you can query using your own applications.
 
 ## :dna: Pipeline
 
 ```mermaid
 flowchart LR
   Input([Input Bucket]) -.-> S3[S3 Trigger]
   S3 --> TextSplitter[Text Splitter]
+  S3 --> PDF[PDF Text Converter]
+  S3 --> Pandoc[Pandoc Text Converter]
+  PDF --> TextSplitter
+  Pandoc --> TextSplitter
   TextSplitter --> Bedrock[Bedrock Embedding Processor]
   Bedrock --> OpenSearch[OpenSearch Vector Storage]
 ```
````

‎examples/simple-pipelines/embedding-pipelines/bedrock-opensearch-pipeline/package.json

Lines changed: 2 additions & 0 deletions

```diff
@@ -30,6 +30,8 @@
     "ts-node": "^10.9.2"
   },
   "dependencies": {
+    "@project-lakechain/pandoc-text-converter": "*",
+    "@project-lakechain/pdf-text-converter": "*",
     "@project-lakechain/bedrock-embedding-processors": "*",
     "@project-lakechain/opensearch-domain": "*",
     "@project-lakechain/opensearch-vector-storage-connector": "*",
```

‎examples/simple-pipelines/embedding-pipelines/bedrock-opensearch-pipeline/stack.ts

Lines changed: 30 additions & 5 deletions

```diff
@@ -26,17 +26,22 @@ import { S3EventTrigger } from '@project-lakechain/s3-event-trigger';
 import { RecursiveCharacterTextSplitter } from '@project-lakechain/recursive-character-text-splitter';
 import { OpenSearchVectorStorageConnector, OpenSearchVectorIndexDefinition } from '@project-lakechain/opensearch-vector-storage-connector';
 import { TitanEmbeddingProcessor, TitanEmbeddingModel } from '@project-lakechain/bedrock-embedding-processors';
+import { PdfTextConverter } from '@project-lakechain/pdf-text-converter';
+import { PandocTextConverter } from '@project-lakechain/pandoc-text-converter';
 import { OpenSearchDomain } from '@project-lakechain/opensearch-domain';
 
 /**
  * An example stack showcasing how to use Amazon Bedrock embeddings
  * and OpenSearch for storing embeddings.
  * The pipeline looks as follows:
  *
- *
- * ┌──────┐   ┌───────────────┐   ┌────────────────────┐   ┌──────────────┐
- * │  S3  ├──►│ Text Splitter ├──►│ Bedrock Embeddings │──►|  OpenSearch  │
- * └──────┘   └───────────────┘   └────────────────────┘   └──────────────┘
+ *                ┌──────────────────────┐
+ *   ┌───────────►│  PDF Text Converter  ├──────────┐
+ *   │            └──────────────────────┘          |
+ *   |                                              ▼
+ * ┌──────────────┐   ┌────────────────────┐   ┌───────────────┐   ┌───────────┐   ┌──────────────┐
+ * │   S3 Input   ├──►│  Pandoc Converter  ├──►│ Text Splitter ├──►│  Bedrock  ├──►|  OpenSearch  │
+ * └──────────────┘   └────────────────────┘   └───────────────┘   └───────────┘   └──────────────┘
  *
  */
 export class BedrockEmbeddingPipeline extends cdk.Stack {
@@ -86,14 +91,34 @@ export class BedrockEmbeddingPipeline extends cdk.Stack {
       .withBucket(source)
       .build();
 
+    // Convert PDF documents to text.
+    const pdfConverter = new PdfTextConverter.Builder()
+      .withScope(this)
+      .withIdentifier('PdfConverter')
+      .withCacheStorage(cache)
+      .withSource(trigger)
+      .build();
+
+    // Convert text-oriented documents (Docx, Markdown, HTML, etc) to text.
+    const pandocConverter = new PandocTextConverter.Builder()
+      .withScope(this)
+      .withIdentifier('PandocConverter')
+      .withCacheStorage(cache)
+      .withSource(trigger)
+      .build();
+
     // We use the `RecursiveCharacterTextSplitter` to split
     // input text into smaller chunks. This is required to ensure
     // that the generated embeddings are relevant.
     const textSplitter = new RecursiveCharacterTextSplitter.Builder()
       .withScope(this)
       .withIdentifier('RecursiveCharacterTextSplitter')
       .withCacheStorage(cache)
-      .withSource(trigger)
+      .withSources([
+        pdfConverter,
+        pandocConverter,
+        trigger
+      ])
       .withChunkSize(4096)
       .build();
```

‎packages/middlewares/storage-connectors/lancedb-storage-connector/src/definitions/opts.ts

Lines changed: 2 additions & 2 deletions

```diff
@@ -47,11 +47,11 @@ export const LanceDbStorageConnectorPropsSchema = MiddlewarePropsSchema.extend({
   /**
    * Whether to include the text associated with the
    * embeddings in LanceDB.
-   * @default true
+   * @default false
    */
   includeText: z
     .boolean()
-    .default(true)
+    .default(false)
 });
 
 // The type of the LanceDbStorageConnectorProps schema.
```
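
To make the new default concrete, here is a standalone Zod sketch reproducing the changed field in isolation (an illustration, not the connector's actual schema):

```typescript
import { z } from 'zod';

// The `includeText` field as changed by this commit: it now
// resolves to `false` when the caller omits it.
const PropsSketch = z.object({
  includeText: z.boolean().default(false)
});

console.log(PropsSketch.parse({}).includeText);                     // false
console.log(PropsSketch.parse({ includeText: true }).includeText);  // true (opt-in via `withIncludeText(true)`)
```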

‎packages/middlewares/storage-connectors/lancedb-storage-connector/src/index.ts

Lines changed: 1 addition & 0 deletions

```diff
@@ -105,6 +105,7 @@ class LanceDbStorageConnectorBuilder extends MiddlewareBuilder {
    * with the embeddings in LanceDB.
    * @param includeText whether to include the text
    * associated with the embeddings in LanceDB.
+   * @default false
    * @returns the builder instance.
    */
   public withIncludeText(includeText: boolean) {
```
