Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 2c9186c

Browse files
committedJun 26, 2024··
feat: add hashing image processor
1 parent 6a94195 commit 2c9186c

File tree

22 files changed

+1198
-3
lines changed

22 files changed

+1198
-3
lines changed
 

‎docs/astro.config.mjs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ export default defineConfig({
111111
}, {
112112
label: 'Laplacian',
113113
link: '/image-processing/laplacian-image-processor'
114+
}, {
115+
label: 'Hashing',
116+
link: '/image-processing/hashing-image-processor'
114117
}]
115118
}, {
116119
label: 'Embeddings',
Loading
1.97 MB
Loading
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
---
2+
title: Hashing
3+
---
4+
5+
import { Image } from 'astro:assets';
6+
import exampleImage from '../../../assets/image-hashing-example.png';
7+
8+
<span title="Label: Pro" data-view-component="true" class="Label Label--api text-uppercase">
9+
Unstable API
10+
</span>
11+
<span title="Label: Pro" data-view-component="true" class="Label Label--version text-uppercase">
12+
0.7.0
13+
</span>
14+
<span title="Label: Pro" data-view-component="true" class="Label Label--package">
15+
<a target="_blank" href="https://www.npmjs.com/package/@project-lakechain/hashing-image-processor">
16+
@project-lakechain/hashing-image-processor
17+
</a>
18+
</span>
19+
<span class="language-icon">
20+
<svg role="img" viewBox="0 0 24 24" width="30" xmlns="http://www.w3.org/2000/svg" style="fill: #3178C6;"><title>TypeScript</title><path d="M1.125 0C.502 0 0 .502 0 1.125v21.75C0 23.498.502 24 1.125 24h21.75c.623 0 1.125-.502 1.125-1.125V1.125C24 .502 23.498 0 22.875 0zm17.363 9.75c.612 0 1.154.037 1.627.111a6.38 6.38 0 0 1 1.306.34v2.458a3.95 3.95 0 0 0-.643-.361 5.093 5.093 0 0 0-.717-.26 5.453 5.453 0 0 0-1.426-.2c-.3 0-.573.028-.819.086a2.1 2.1 0 0 0-.623.242c-.17.104-.3.229-.393.374a.888.888 0 0 0-.14.49c0 .196.053.373.156.529.104.156.252.304.443.444s.423.276.696.41c.273.135.582.274.926.416.47.197.892.407 1.266.628.374.222.695.473.963.753.268.279.472.598.614.957.142.359.214.776.214 1.253 0 .657-.125 1.21-.373 1.656a3.033 3.033 0 0 1-1.012 1.085 4.38 4.38 0 0 1-1.487.596c-.566.12-1.163.18-1.79.18a9.916 9.916 0 0 1-1.84-.164 5.544 5.544 0 0 1-1.512-.493v-2.63a5.033 5.033 0 0 0 3.237 1.2c.333 0 .624-.03.872-.09.249-.06.456-.144.623-.25.166-.108.29-.234.373-.38a1.023 1.023 0 0 0-.074-1.089 2.12 2.12 0 0 0-.537-.5 5.597 5.597 0 0 0-.807-.444 27.72 27.72 0 0 0-1.007-.436c-.918-.383-1.602-.852-2.053-1.405-.45-.553-.676-1.222-.676-2.005 0-.614.123-1.141.369-1.582.246-.441.58-.804 1.004-1.089a4.494 4.494 0 0 1 1.47-.629 7.536 7.536 0 0 1 1.77-.201zm-15.113.188h9.563v2.166H9.506v9.646H6.789v-9.646H3.375z"/></svg>
21+
</span>
22+
<div style="margin-top: 26px"></div>
23+
24+
---
25+
26+
The hashing image processor makes it possible to enrich the metadata of images with hash values associated with the visual representation of an image.
27+
This middleware supports different hashing algorithms, including [average hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html), [perceptual hashing](https://en.wikipedia.org/wiki/Perceptual_hashing), [difference hashing](https://github.com/Tom64b/dHash), [wavelet hashing](https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5), and color hashing.
28+
29+
These hashing algorithms can be used to compare how visually different images are from one another.
30+
They provide a more computationally efficient way to compare images, compared to vector embeddings which also take into account the semantic aspect of an image.
31+
32+
<br />
33+
<p align="center">
34+
<img width="680" src={exampleImage.src} />
35+
</p>
36+
37+
<p align="center" style="font-size: 14px">
38+
<em>An example using average hashing.</em>
39+
<br />
40+
<em>Credits <a href="https://unsplash.com/fr/photos/une-chaise-avec-un-magazine-dessus-6Y4wlHeiGhM#:~:text=Remerciez-,Branislav%20Rodman,-sur%20les%20r%C3%A9seaux">Branislav Rodman</a> on <a href="https://unsplash.com/fr/photos/une-chaise-avec-un-magazine-dessus-6Y4wlHeiGhM">Unsplash</a></em>
41+
</p>
42+
<br />
43+
44+
<br />
45+
46+
---
47+
48+
### #️⃣ Computing Hashes
49+
50+
To use this middleware, you import it in your CDK stack and instantiate it as part of a pipeline.
51+
52+
```typescript
53+
import { HashingImageProcessor } from '@project-lakechain/hashing-image-processor';
54+
import { CacheStorage } from '@project-lakechain/core';
55+
56+
class Stack extends cdk.Stack {
57+
constructor(scope: cdk.Construct, id: string) {
58+
const cache = new CacheStorage(this, 'Cache');
59+
60+
// Computes the different image hashes based on all supported algorithms.
61+
const hashing = new HashingImageProcessor.Builder()
62+
.withScope(this)
63+
.withIdentifier('HashingImageProcessor')
64+
.withCacheStorage(cache)
65+
.withSource(source) // 👈 Specify a data source
66+
.build();
67+
}
68+
}
69+
```
70+
71+
<br />
72+
73+
---
74+
75+
#### Selecting Algorithms
76+
77+
You can explicitly select which hashing algorithms to enable when enriching the document metadata with the different types of image hashes.
78+
79+
> 💁 By default, all hashing algorithms are enabled.
80+
81+
```typescript
82+
import { HashingImageProcessor } from '@project-lakechain/hashing-image-processor';
83+
84+
const hashing = new HashingImageProcessor.Builder()
85+
.withScope(this)
86+
.withIdentifier('HashingImageProcessor')
87+
.withCacheStorage(cache)
88+
.withSource(source)
89+
// Optionally specify which algorithms to use.
90+
.withAverageHashing(true)
91+
.withPerceptualHashing(true)
92+
.withDifferenceHashing(false)
93+
.withWaveletHashing(false)
94+
.withColorHashing(false)
95+
.build();
96+
```
97+
98+
<br />
99+
100+
---
101+
102+
### 📄 Output
103+
104+
The Hashing image processor does not modify or alter source images in any way. It instead enriches the metadata of processed documents by setting the hash values associated with each of the enabled hashing algorithms.
105+
106+
<details>
107+
<summary>💁 Click to expand example</summary>
108+
109+
> ℹ️ Below is an example of a [CloudEvent](/project-lakechain/general/events) emitted by the Hashing image processor.
110+
111+
```json
112+
{
113+
"specversion": "1.0",
114+
"id": "1780d5de-fd6f-4530-98d7-82ebee85ea39",
115+
"type": "document-created",
116+
"time": "2023-10-22T13:19:10.657Z",
117+
"data": {
118+
"chainId": "6ebf76e4-f70c-440c-98f9-3e3e7eb34c79",
119+
"source": {
120+
"url": "s3://bucket/image.png",
121+
"type": "image/png",
122+
"size": 245328,
123+
"etag": "1243cbd6cf145453c8b5519a2ada4779"
124+
},
125+
"document": {
126+
"url": "s3://bucket/image.png",
127+
"type": "image/png",
128+
"size": 245328,
129+
"etag": "1243cbd6cf145453c8b5519a2ada4779"
130+
},
131+
"metadata": {
132+
"properties": {
133+
"kind": "image",
134+
"attrs": {
135+
"hashes": {
136+
"average": "00077ffbf2fefee0",
137+
"perceptual": "f53a175d6848d9c4",
138+
"difference": "1c4ccea3269084c8",
139+
"wavelet": "000707d1f2fefee0",
140+
"color": "06e00000040"
141+
}
142+
}
143+
}
144+
},
145+
"callStack": []
146+
}
147+
}
148+
```
149+
150+
</details>
151+
152+
<br />
153+
154+
---
155+
156+
### 🏗️ Architecture
157+
158+
This middleware runs within a Lambda compute, and packages the [`imagehash`](https://github.com/JohannesBuchner/imagehash/tree/master) library to compute the different hash values of images.
159+
160+
![Architecture](../../../assets/hashing-image-processor-architecture.png)
161+
162+
<br />
163+
164+
---
165+
166+
### 🏷️ Properties
167+
168+
<br />
169+
170+
##### Supported Inputs
171+
172+
| Mime Type | Description |
173+
| ----------- | ----------- |
174+
| `image/jpeg` | JPEG image |
175+
| `image/png` | PNG image |
176+
| `image/bmp` | BMP image |
177+
| `image/webp` | WebP image |
178+
179+
##### Supported Outputs
180+
181+
| Mime Type | Description |
182+
| ----------- | ----------- |
183+
| `image/jpeg` | JPEG image |
184+
| `image/png` | PNG image |
185+
| `image/bmp` | BMP image |
186+
| `image/webp` | WebP image |
187+
188+
##### Supported Compute Types
189+
190+
| Type | Description |
191+
| ----- | ----------- |
192+
| `CPU` | This middleware only supports CPU compute. |
193+
194+
<br />
195+
196+
---
197+
198+
### 📖 Examples
199+
200+
- [Image Hashing Pipeline](https://github.com/awslabs/project-lakechain/tree/main/examples/simple-pipelines/image-hashing-pipeline) - An example showcasing how to compute the hash of images.

‎docs/src/content/docs/image-processing/laplacian-image-processor.mdx

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ The Laplacian image processor does not modify or alter source images in any way.
143143
"etag": "1243cbd6cf145453c8b5519a2ada4779"
144144
},
145145
"metadata": {
146-
"description": "A man sitting on a wooden chair in a cozy room.",
147146
"properties": {
148147
"kind": "image",
149148
"attrs": {
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# #️⃣ Image Hashing Pipeline
2+
3+
> In this example, we demonstrate how to compute visual hashes associated with images within a Lakechain pipeline, using different hashing algorithms.
4+
5+
## :dna: Pipeline
6+
7+
```mermaid
8+
flowchart LR
9+
Input([Input Bucket]) -.-> S3[S3 Trigger]
10+
S3 -. Image .-> Hashing[Hashing Image Processor]
11+
Hashing -.-> Destination[S3 Storage Connector]
12+
Destination -.-> Output([Output Bucket])
13+
```
14+
15+
## ❓ What is Happening
16+
17+
This example demonstrates how to compute the hash of images using the `Hashing Image Processor` middleware.
18+
19+
The pipeline is triggered when an image is uploaded to the source S3 bucket. The image is then processed by the `Hashing Image Processor` middleware which computes the hashes of the image using different hashing algorithms by default. The pipeline then stores the image and its metadata in the destination S3 bucket.
20+
21+
<br />
22+
<p align="center">
23+
<img width="700" src="../../../docs/src/assets/image-hashing-example.png">
24+
</p>
25+
<br />
26+
27+
## 📝 Requirements
28+
29+
The following requirements are needed to deploy the infrastructure associated with this pipeline:
30+
31+
- You need access to a development AWS account.
32+
- [AWS CDK](https://docs.aws.amazon.com/cdk/latest/guide/getting_started.html#getting_started_install) is required to deploy the infrastructure.
33+
- [Docker](https://docs.docker.com/get-docker/) is required to be running to build middlewares.
34+
- [Node.js](https://nodejs.org/en/download/) v18+ and NPM.
35+
- [Python](https://www.python.org/downloads/) v3.8+ and [Pip](https://pip.pypa.io/en/stable/installation/).
36+
37+
## 🚀 Deploy
38+
39+
Head to the directory [`examples/simple-pipelines/image-hashing-pipeline`](/examples/simple-pipelines/image-hashing-pipeline) in the repository and run the following commands to build the example:
40+
41+
```bash
42+
npm install
43+
npm run build-pkg
44+
```
45+
46+
You can then deploy the example to your account (ensure your AWS CDK is configured with the appropriate AWS credentials and AWS region):
47+
48+
```bash
49+
npm run deploy
50+
```
51+
52+
## 🧹 Clean up
53+
54+
Don't forget to clean up the resources created by this example by running the following command:
55+
56+
```bash
57+
npm run destroy
58+
```
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
{
2+
"app": "npx ts-node --prefer-ts-exts stack.ts",
3+
"watch": {
4+
"include": ["**"],
5+
"exclude": [
6+
"README.md",
7+
"cdk*.json",
8+
"**/*.d.ts",
9+
"**/*.js",
10+
"tsconfig.json",
11+
"package*.json",
12+
"yarn.lock",
13+
"node_modules",
14+
"test",
15+
"**/*.zip"
16+
]
17+
},
18+
"context": {
19+
"@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
20+
"@aws-cdk/core:stackRelativeExports": true,
21+
"@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
22+
"@aws-cdk/aws-lambda:recognizeVersionProps": true,
23+
"@aws-cdk/aws-lambda:recognizeLayerVersion": true,
24+
"@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
25+
"@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true,
26+
"@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true,
27+
"@aws-cdk/core:checkSecretUsage": true,
28+
"@aws-cdk/aws-iam:minimizePolicies": true,
29+
"@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
30+
"@aws-cdk/core:validateSnapshotRemovalPolicy": true,
31+
"@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true,
32+
"@aws-cdk/aws-s3:createDefaultLoggingPolicy": true,
33+
"@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true,
34+
"@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
35+
"@aws-cdk/core:enablePartitionLiterals": true,
36+
"@aws-cdk/customresources:installLatestAwsSdkDefault": false,
37+
"@aws-cdk/core:target-partitions": ["aws", "aws-cn"]
38+
}
39+
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
{
2+
"name": "image-hashing-pipeline",
3+
"description": "An example showcasing how to compute the hash of images.",
4+
"version": "0.7.0",
5+
"private": true,
6+
"scripts": {
7+
"build": "tsc",
8+
"build-pkg": "npx lerna run build --scope=image-hashing-pipeline --include-dependencies",
9+
"clean": "npx rimraf dist/ cdk.out/ node_modules/",
10+
"audit": "npm audit && npm run synth --silent | cfn_nag",
11+
"lint": "npx eslint .",
12+
"synth": "npx cdk synth",
13+
"deploy": "npx cdk deploy",
14+
"hotswap": "npx cdk deploy --hotswap",
15+
"destroy": "npx cdk destroy --all"
16+
},
17+
"author": {
18+
"name": "Amazon Web Services",
19+
"url": "https://aws.amazon.com"
20+
},
21+
"repository": {
22+
"type": "git",
23+
"url": "git://github.com/awslabs/project-lakechain"
24+
},
25+
"license": "Apache-2.0",
26+
"devDependencies": {
27+
"@types/node": "^20.8.10",
28+
"esbuild": "^0.21.5",
29+
"ts-jest": "^29.0.0",
30+
"ts-node": "^10.9.2"
31+
},
32+
"dependencies": {
33+
"@project-lakechain/s3-event-trigger": "*",
34+
"@project-lakechain/hashing-image-processor": "*",
35+
"@project-lakechain/s3-storage-connector": "*"
36+
},
37+
"peerDependencies": {
38+
"aws-cdk-lib": "^2.147.1",
39+
"constructs": "^10.3.0"
40+
}
41+
}
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
#!/usr/bin/env node
2+
3+
/*
4+
* Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
import * as cdk from 'aws-cdk-lib';
20+
import * as s3 from 'aws-cdk-lib/aws-s3';
21+
22+
import { Construct } from 'constructs';
23+
import { CacheStorage } from '@project-lakechain/core';
24+
import { S3EventTrigger } from '@project-lakechain/s3-event-trigger';
25+
import { HashingImageProcessor } from '@project-lakechain/hashing-image-processor';
26+
import { S3StorageConnector } from '@project-lakechain/s3-storage-connector';
27+
28+
/**
29+
* An example stack showcasing how to compute the hash
30+
* of images using different hashing algorithms.
31+
*
32+
* The pipeline looks as follows:
33+
*
34+
* ┌──────────────┐ ┌─────────────────────┐ ┌─────────────┐
35+
* │ S3 Input ├───►│ Hashing Processor ├───►│ S3 Output │
36+
* └──────────────┘ └─────────────────────┘ └─────────────┘
37+
*
38+
*/
39+
export class ImageHashingPipeline extends cdk.Stack {
40+
41+
/**
42+
* Stack constructor.
43+
*/
44+
constructor(scope: Construct, id: string, env: cdk.StackProps) {
45+
super(scope, id, {
46+
description: 'A pipeline computing the hash of images using different algorithms.',
47+
...env
48+
});
49+
50+
///////////////////////////////////////////
51+
/////// S3 Storage ///////
52+
///////////////////////////////////////////
53+
54+
// The source bucket.
55+
const source = new s3.Bucket(this, 'Bucket', {
56+
encryption: s3.BucketEncryption.S3_MANAGED,
57+
blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL,
58+
autoDeleteObjects: true,
59+
removalPolicy: cdk.RemovalPolicy.DESTROY,
60+
enforceSSL: true
61+
});
62+
63+
// The destination bucket.
64+
const destination = new s3.Bucket(this, 'Destination', {
65+
encryption: s3.BucketEncryption.S3_MANAGED,
66+
blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL,
67+
autoDeleteObjects: true,
68+
removalPolicy: cdk.RemovalPolicy.DESTROY,
69+
enforceSSL: true
70+
});
71+
72+
// The cache storage.
73+
const cache = new CacheStorage(this, 'Cache', {});
74+
75+
///////////////////////////////////////////
76+
/////// Lakechain Pipeline ///////
77+
///////////////////////////////////////////
78+
79+
// Create the S3 trigger monitoring the bucket
80+
// for uploaded objects.
81+
const trigger = new S3EventTrigger.Builder()
82+
.withScope(this)
83+
.withIdentifier('Trigger')
84+
.withCacheStorage(cache)
85+
.withBucket(source)
86+
.build();
87+
88+
// Compute the hash of images using different algorithms.
89+
const hash = new HashingImageProcessor.Builder()
90+
.withScope(this)
91+
.withIdentifier('HashingImageProcessor')
92+
.withCacheStorage(cache)
93+
.withSource(trigger)
94+
// Optionally specify which algorithms to use.
95+
.withAverageHashing(true)
96+
.withPerceptualHashing(true)
97+
.withDifferenceHashing(true)
98+
.withWaveletHashing(true)
99+
.withColorHashing(true)
100+
.build();
101+
102+
// Write the images to the destination bucket.
103+
new S3StorageConnector.Builder()
104+
.withScope(this)
105+
.withIdentifier('SharpStorageConnector')
106+
.withCacheStorage(cache)
107+
.withSource(hash)
108+
.withDestinationBucket(destination)
109+
.build();
110+
111+
// Display the source bucket information in the console.
112+
new cdk.CfnOutput(this, 'SourceBucketName', {
113+
description: 'The name of the source bucket.',
114+
value: source.bucketName
115+
});
116+
117+
// Display the destination bucket information in the console.
118+
new cdk.CfnOutput(this, 'DestinationBucketName', {
119+
description: 'The name of the destination bucket.',
120+
value: destination.bucketName
121+
});
122+
}
123+
}
124+
125+
// Creating the CDK application.
126+
const app = new cdk.App();
127+
128+
// Environment variables.
129+
const account = process.env.CDK_DEFAULT_ACCOUNT ?? process.env.AWS_DEFAULT_ACCOUNT;
130+
const region = process.env.CDK_DEFAULT_REGION ?? process.env.AWS_DEFAULT_REGION;
131+
132+
// Deploy the stack.
133+
new ImageHashingPipeline(app, 'ImageHashingPipeline', {
134+
env: {
135+
account,
136+
region
137+
}
138+
});
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"extends": "../../../tsconfig.json",
3+
"compilerOptions": {
4+
"outDir": "./dist"
5+
},
6+
"include": ["./*.ts"]
7+
}

‎package-lock.json

Lines changed: 46 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
2+
* Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
module.exports = {
18+
testEnvironment: 'node',
19+
roots: ['<rootDir>/test'],
20+
testMatch: ['**/*.test.ts'],
21+
transform: {
22+
'^.+\\.tsx?$': 'ts-jest'
23+
}
24+
};
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"name": "@project-lakechain/hashing-image-processor",
3+
"description": "Computes the hashes of images using different algorithms.",
4+
"version": "0.7.0",
5+
"main": "dist/index.js",
6+
"types": "dist/index.d.ts",
7+
"files": [
8+
"dist/"
9+
],
10+
"publishConfig": {
11+
"access": "public"
12+
},
13+
"scripts": {
14+
"build": "npx tsc",
15+
"postbuild": "npx shx cp -r ./src/lambdas dist/",
16+
"clean": "npx rimraf dist/ node_modules/",
17+
"lint": "npx eslint .",
18+
"prettier:check": "npx prettier --check ."
19+
},
20+
"author": {
21+
"name": "Amazon Web Services",
22+
"url": "https://aws.amazon.com"
23+
},
24+
"repository": {
25+
"type": "git",
26+
"url": "git://github.com/awslabs/project-lakechain"
27+
},
28+
"bugs": {
29+
"url": "https://github.com/awslabs/project-lakechain/issues"
30+
},
31+
"license": "Apache-2.0",
32+
"homepage": "https://awslabs.github.io/project-lakechain/",
33+
"devDependencies": {
34+
"@types/jest": "^29.0.0",
35+
"@types/node": "^20.2.1",
36+
"esbuild": "^0.21.5",
37+
"rimraf": "^5.0.1",
38+
"shx": "^0.3.4"
39+
},
40+
"dependencies": {
41+
"@project-lakechain/core": "^0.7.0"
42+
},
43+
"peerDependencies": {
44+
"aws-cdk-lib": "^2.147.1",
45+
"constructs": "^10.3.0"
46+
},
47+
"lint-staged": {
48+
"*.{ts, json}": "npx eslint",
49+
"*.{js,json,md}": "npx prettier --check"
50+
}
51+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
import { z } from 'zod';
18+
import { MiddlewarePropsSchema } from '@project-lakechain/core/middleware';
19+
20+
/**
21+
* The Hashing image processor properties.
22+
*/
23+
export const HashingImageProcessorPropsSchema = MiddlewarePropsSchema.extend({
24+
25+
/**
26+
* Whether to compute the average hash of images.
27+
* @default true
28+
*/
29+
averageHashing: z
30+
.boolean()
31+
.default(true),
32+
33+
/**
34+
* Whether to compute the perceptual hash of images.
35+
* @default true
36+
*/
37+
perceptualHashing: z
38+
.boolean()
39+
.default(true),
40+
41+
/**
42+
* Whether to compute the difference hash of images.
43+
* @default true
44+
*/
45+
differenceHashing: z
46+
.boolean()
47+
.default(true),
48+
49+
/**
50+
* Whether to compute the wavelet hash of images.
51+
* @default true
52+
*/
53+
waveletHashing: z
54+
.boolean()
55+
.default(true),
56+
57+
/**
58+
* Whether to compute the color hash of images.
59+
* @default true
60+
*/
61+
colorHashing: z
62+
.boolean()
63+
.default(true)
64+
});
65+
66+
// Export the `HashingImageProcessorProps` type.
67+
export type HashingImageProcessorProps = z.infer<typeof HashingImageProcessorPropsSchema>;
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
/*
2+
* Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
import path from 'path';
18+
19+
import * as cdk from 'aws-cdk-lib';
20+
import * as lambda from 'aws-cdk-lib/aws-lambda';
21+
import * as sources from 'aws-cdk-lib/aws-lambda-event-sources';
22+
import * as iam from 'aws-cdk-lib/aws-iam';
23+
24+
import { Construct } from 'constructs';
25+
import { ServiceDescription } from '@project-lakechain/core/service';
26+
import { ComputeType } from '@project-lakechain/core/compute-type';
27+
import { when } from '@project-lakechain/core/dsl/vocabulary/conditions';
28+
import { HashingImageProcessorProps, HashingImageProcessorPropsSchema } from './definitions/opts';
29+
30+
import {
31+
Middleware,
32+
MiddlewareBuilder,
33+
LAMBDA_INSIGHTS_VERSION,
34+
NAMESPACE
35+
} from '@project-lakechain/core/middleware';
36+
37+
/**
38+
* The service description.
39+
*/
40+
const description: ServiceDescription = {
41+
name: 'hashing-image-processor',
42+
description: 'Computes the hashes of images using different algorithms.',
43+
version: '0.7.0',
44+
attrs: {}
45+
};
46+
47+
/**
48+
* The maximum time the processing lambda
49+
* is allowed to run.
50+
*/
51+
const PROCESSING_TIMEOUT = cdk.Duration.seconds(30);
52+
53+
/**
54+
* The default memory size to allocate for the compute.
55+
*/
56+
const DEFAULT_MEMORY_SIZE = 512;
57+
58+
/**
59+
* Builder for the `HashingImageProcessor` middleware.
60+
*/
61+
class HashingImageProcessorBuilder extends MiddlewareBuilder {
62+
private providerProps: Partial<HashingImageProcessorProps> = {};
63+
64+
/**
65+
* Sets whether to compute the average hash of images.
66+
* @default true
67+
* @param averageHashing whether to compute the average hash of images.
68+
* @returns the builder instance.
69+
*/
70+
public withAverageHashing(averageHashing: boolean): HashingImageProcessorBuilder {
71+
this.providerProps.averageHashing = averageHashing;
72+
return (this);
73+
}
74+
75+
/**
76+
* Sets whether to compute the perceptual hash of images.
77+
* @default true
78+
* @param perceptualHashing whether to compute the perceptual hash of images.
79+
* @returns the builder instance.
80+
*/
81+
public withPerceptualHashing(perceptualHashing: boolean): HashingImageProcessorBuilder {
82+
this.providerProps.perceptualHashing = perceptualHashing;
83+
return (this);
84+
}
85+
86+
/**
87+
* Sets whether to compute the difference hash of images.
88+
* @default true
89+
* @param differenceHashing whether to compute the difference hash of images.
90+
* @returns the builder instance.
91+
*/
92+
public withDifferenceHashing(differenceHashing: boolean): HashingImageProcessorBuilder {
93+
this.providerProps.differenceHashing = differenceHashing;
94+
return (this);
95+
}
96+
97+
/**
98+
* Sets whether to compute the wavelet hash of images.
99+
* @default true
100+
* @param waveletHashing whether to compute the wavelet hash of images.
101+
* @returns the builder instance.
102+
*/
103+
public withWaveletHashing(waveletHashing: boolean): HashingImageProcessorBuilder {
104+
this.providerProps.waveletHashing = waveletHashing;
105+
return (this);
106+
}
107+
108+
/**
109+
* Sets whether to compute the color hash of images.
110+
* @default true
111+
* @param colorHashing whether to compute the color hash of images.
112+
* @returns the builder instance.
113+
*/
114+
public withColorHashing(colorHashing: boolean): HashingImageProcessorBuilder {
115+
this.providerProps.colorHashing = colorHashing;
116+
return (this);
117+
}
118+
119+
/**
120+
* @returns a new instance of the `HashingImageProcessor`
121+
* service constructed with the given parameters.
122+
*/
123+
public build(): HashingImageProcessor {
124+
return (new HashingImageProcessor(
125+
this.scope,
126+
this.identifier, {
127+
...this.providerProps as HashingImageProcessorProps,
128+
...this.props
129+
}
130+
));
131+
}
132+
}
133+
134+
/**
135+
 * A middleware allowing to compute the hashes
136+
* of images.
137+
*/
138+
export class HashingImageProcessor extends Middleware {
139+
140+
/**
141+
* The event processing lambda function.
142+
*/
143+
public eventProcessor: lambda.IFunction;
144+
145+
/**
146+
* The builder for the `HashingImageProcessor` service.
147+
*/
148+
static Builder = HashingImageProcessorBuilder;
149+
150+
/**
151+
* Provider constructor.
152+
*/
153+
constructor(scope: Construct, id: string, private props: HashingImageProcessorProps) {
154+
super(scope, id, description, {
155+
...props,
156+
queueVisibilityTimeout: cdk.Duration.seconds(
157+
3 * PROCESSING_TIMEOUT.toSeconds()
158+
)
159+
});
160+
161+
// Validating the properties.
162+
this.props = this.parse(HashingImageProcessorPropsSchema, props);
163+
164+
///////////////////////////////////////////
165+
/////// Processing Function ///////
166+
///////////////////////////////////////////
167+
168+
this.eventProcessor = new lambda.DockerImageFunction(this, 'Compute', {
169+
description: 'Computes the hashes of images using different algorithms.',
170+
code: lambda.DockerImageCode.fromImageAsset(
171+
path.resolve(__dirname, 'lambdas', 'processor')
172+
),
173+
vpc: this.props.vpc,
174+
memorySize: this.props.maxMemorySize ?? DEFAULT_MEMORY_SIZE,
175+
timeout: PROCESSING_TIMEOUT,
176+
architecture: lambda.Architecture.X86_64,
177+
tracing: lambda.Tracing.ACTIVE,
178+
environmentEncryption: this.props.kmsKey,
179+
logGroup: this.logGroup,
180+
insightsVersion: props.cloudWatchInsights ?
181+
LAMBDA_INSIGHTS_VERSION :
182+
undefined,
183+
environment: {
184+
POWERTOOLS_SERVICE_NAME: description.name,
185+
POWERTOOLS_METRICS_NAMESPACE: NAMESPACE,
186+
SNS_TARGET_TOPIC: this.eventBus.topicArn,
187+
LAKECHAIN_CACHE_STORAGE: this.props.cacheStorage.id(),
188+
AVERAGE_HASHING: this.props.averageHashing ? 'true' : 'false',
189+
PERCEPTUAL_HASHING: this.props.perceptualHashing ? 'true' : 'false',
190+
DIFFERENCE_HASHING: this.props.differenceHashing ? 'true' : 'false',
191+
WAVELET_HASHING: this.props.waveletHashing ? 'true' : 'false',
192+
COLOR_HASHING: this.props.colorHashing ? 'true' : 'false'
193+
}
194+
});
195+
196+
// Allows this construct to act as a `IGrantable`
197+
// for other middlewares to grant the processing
198+
// lambda permissions to access their resources.
199+
this.grantPrincipal = this.eventProcessor.grantPrincipal;
200+
201+
// Plug the SQS queue into the lambda function.
202+
this.eventProcessor.addEventSource(new sources.SqsEventSource(this.eventQueue, {
203+
batchSize: this.props.batchSize ?? 1,
204+
reportBatchItemFailures: true
205+
}));
206+
207+
// Function permissions.
208+
this.eventBus.grantPublish(this.eventProcessor);
209+
210+
super.bind();
211+
}
212+
213+
/**
 * Allows a grantee to read from the processed documents
 * generated by this middleware.
 * @param grantee the principal to grant read access to.
 * @returns an empty grant, as no permissions are attached
 * by this middleware itself.
 */
grantReadProcessedDocuments(grantee: iam.IGrantable): iam.Grant {
  // This middleware forwards documents from the previous
  // middleware unchanged, so read access must be granted on
  // every upstream source that actually stores the documents.
  this.sources.forEach((source) => {
    source.grantReadProcessedDocuments(grantee);
  });
  return ({} as iam.Grant);
}
227+
228+
/**
 * @returns an array of mime-types supported as input
 * type by the data producer.
 */
supportedInputTypes(): string[] {
  const mimeTypes = [
    'image/jpeg',
    'image/png',
    'image/bmp',
    'image/webp'
  ];
  return (mimeTypes);
}
240+
241+
/**
 * @returns an array of mime-types supported as output
 * type by the data producer. The document itself is
 * forwarded as-is, so outputs mirror the inputs.
 */
supportedOutputTypes(): string[] {
  const supported = this.supportedInputTypes();
  return (supported);
}
248+
249+
/**
 * @returns the supported compute types by a given
 * middleware.
 */
supportedComputeTypes(): ComputeType[] {
  // Image hashing runs on the CPU only.
  return ([ComputeType.CPU]);
}
258+
259+
/**
 * @returns the middleware conditional statement defining
 * in which conditions this middleware should be executed.
 * In this case, we want the middleware to only be invoked
 * when the document mime-type is supported, and the event
 * type is `document-created`.
 */
conditional() {
  const base = super.conditional();
  return (base.and(when('type').equals('document-created')));
}
272+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Container image for the hashing image processor Lambda,
# based on the AWS-provided Python 3.11 Lambda runtime.
FROM public.ecr.aws/lambda/python:3.11

# Add requirements to the layer.
COPY requirements.txt ${LAMBDA_TASK_ROOT}

# Install Python dependencies without retaining the pip cache,
# keeping the resulting image smaller.
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files in the container.
COPY . ${LAMBDA_TASK_ROOT}

# Entry point of the Lambda function (module.handler).
CMD ["index.lambda_handler"]
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
# Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
import io
17+
import json
18+
import boto3
19+
import imagehash
20+
21+
from typing import Optional
22+
from PIL import Image
23+
from urllib.parse import urlparse, unquote
24+
from aws_lambda_powertools import Logger, Tracer
25+
from aws_lambda_powertools.utilities.data_classes import event_source, SQSEvent
26+
from aws_lambda_powertools.utilities.typing import LambdaContext
27+
from aws_lambda_powertools.utilities.data_classes.sqs_event import SQSRecord
28+
from publish import publish_event
29+
30+
from aws_lambda_powertools.utilities.batch import (
31+
BatchProcessor,
32+
EventType,
33+
process_partial_response
34+
)
35+
36+
# Environment variables.
SERVICE_NAME = os.getenv('POWERTOOLS_SERVICE_NAME')
# NOTE(review): read here but not referenced elsewhere in this
# module — confirm whether it is still needed.
PROCESSED_FILES_BUCKET = os.getenv('PROCESSED_FILES_BUCKET')
# Feature flags gating each hashing algorithm. Each one is
# enabled by default unless set to a string other than 'true'.
AVERAGE_HASHING = os.getenv('AVERAGE_HASHING', 'true') == 'true'
PERCEPTUAL_HASHING = os.getenv('PERCEPTUAL_HASHING', 'true') == 'true'
DIFFERENCE_HASHING = os.getenv('DIFFERENCE_HASHING', 'true') == 'true'
WAVELET_HASHING = os.getenv('WAVELET_HASHING', 'true') == 'true'
COLOR_HASHING = os.getenv('COLOR_HASHING', 'true') == 'true'

# Runtime function attributes.
logger = Logger(service=SERVICE_NAME)
tracer = Tracer(service=SERVICE_NAME)
s3_client = boto3.client('s3')
sns_client = boto3.client('sns')
processor = BatchProcessor(event_type=EventType.SQS)

# Resolve the resampling filter used when downscaling images.
try:
    ANTIALIAS = Image.Resampling.LANCZOS
except AttributeError:
    # `Image.Resampling` is the modern spelling; older Pillow
    # releases only expose the deprecated `Image.ANTIALIAS`.
    # https://pillow.readthedocs.io/en/stable/deprecations.html
    ANTIALIAS = Image.ANTIALIAS
58+
59+
60+
def merge(dct, merge_dct):
    """
    Recursively merges ``merge_dct`` into ``dct`` in place.
    Unlike :meth:`dict.update`, nested dictionaries are merged
    key by key rather than replaced wholesale, and values that
    already exist in ``dct`` are preserved, never overwritten.
    :param dct: dict onto which the merge is executed
    :param merge_dct: dct merged into dct
    :return: None
    """
    for key, value in merge_dct.items():
        existing = dct.get(key)
        if isinstance(existing, dict) and isinstance(value, dict):
            # Both sides hold dictionaries: descend and merge them.
            merge(existing, value)
        elif key not in dct:
            # Only introduce keys that are absent, so values
            # already present in `dct` take precedence.
            dct[key] = value
75+
76+
77+
def load_image(url) -> bytes:
    """
    Reads the raw image bytes from the S3 location designated
    by the given parsed URL.
    :param url: a parsed URL (``urllib.parse``) pointing to the
        S3 object holding the image.
    :return: the image content as bytes.
    """
    # The bucket is the URL authority and the key is the path;
    # both are URL-decoded and the leading slash is stripped
    # since it is not part of the S3 key.
    bucket_name = unquote(url.netloc)
    object_key = unquote(url.path).lstrip('/')
    result = s3_client.get_object(Bucket=bucket_name, Key=object_key)
    return result['Body'].read()
86+
87+
88+
def process_document(event: dict) -> dict:
    """
    Computes the enabled image hashes for the document attached
    to the given cloud event and records them in the document
    metadata under ``properties.attrs.hashes``.
    :param event: the received cloud event.
    :return: the event enriched with the computed hashes.
    """
    document = event['data']['document']
    url = urlparse(document['url'])

    # Load and decode the image entirely in memory.
    image = Image.open(io.BytesIO(load_image(url)))

    # Downscale large images in place; hashing does not require
    # more than 1024x1024 pixels.
    max_size = (1024, 1024)
    if image.size[0] > max_size[0] or image.size[1] > max_size[1]:
        image.thumbnail(max_size, ANTIALIAS)

    # Ensure the metadata carries a `hashes` container without
    # clobbering any attribute already present.
    merge(event['data']['metadata'], {
        'properties': {
            'kind': 'image',
            'attrs': {
                'hashes': {}
            }
        }
    })
    hashes = event['data']['metadata']['properties']['attrs']['hashes']

    # Each supported algorithm is gated by its own feature flag;
    # the resulting hash is serialized to its string form.
    algorithms = (
        (AVERAGE_HASHING, 'average', imagehash.average_hash),
        (PERCEPTUAL_HASHING, 'perceptual', imagehash.phash),
        (DIFFERENCE_HASHING, 'difference', imagehash.dhash),
        (WAVELET_HASHING, 'wavelet', imagehash.whash),
        (COLOR_HASHING, 'color', imagehash.colorhash)
    )
    for enabled, name, algorithm in algorithms:
        if enabled:
            hashes[name] = str(algorithm(image))

    return event
140+
141+
142+
def record_handler(record: SQSRecord, _: Optional[LambdaContext] = None):
    """
    Processes a single SQS record: computes the image hashes for
    the associated document and forwards the enriched event to
    the next middlewares.
    :param record: The SQS record to process.
    :param _: The Lambda context (unused).
    :return: the published event.
    """
    event = json.loads(record.body)
    return publish_event(process_document(event))
152+
153+
154+
@logger.inject_lambda_context()
@tracer.capture_lambda_handler
@event_source(data_class=SQSEvent)
def lambda_handler(event: SQSEvent, context: LambdaContext):
    """
    Entry point of the Lambda function. Processes each SQS record
    with partial failure handling: failed records are reported back
    to SQS individually so that only those records are retried.
    :param event: The SQS event to process.
    :param context: The Lambda context.
    """
    return process_partial_response(
        event=event,
        record_handler=record_handler,
        processor=processor,
        context=context
    )
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
import boto3
17+
import json
18+
19+
from aws_lambda_powertools import Logger
20+
21+
# AWS service clients.
sns_client = boto3.client('sns')

# Environment variables.
SERVICE_NAME = os.getenv('POWERTOOLS_SERVICE_NAME')
SNS_TARGET_TOPIC = os.getenv('SNS_TARGET_TOPIC')

# Runtime function attributes.
logger = Logger(service=SERVICE_NAME)
30+
31+
def publish_event(event: dict):
    """
    Records this service on the event call stack and forwards the
    event to the configured SNS topic for the next middleware to
    process.
    :param event: The event to publish.
    :return: the event, after its call stack has been updated.
    """
    # Prepend this service name so downstream middlewares can
    # trace the path the document followed through the pipeline.
    event['data']['callStack'].insert(0, SERVICE_NAME)

    # Only publish when a target topic is configured.
    if SNS_TARGET_TOPIC:
        logger.info(event)
        sns_client.publish(
            TopicArn=SNS_TARGET_TOPIC,
            Message=json.dumps(event)
        )

    return event
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
boto3
2+
aws_lambda_powertools
3+
aws-xray-sdk
4+
imagehash==4.3.1
5+
pillow==10.3.0
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"extends": "../../../../tsconfig.json",
3+
"compilerOptions": {
4+
"outDir": "./dist",
5+
"rootDir": "./src"
6+
},
7+
"include": ["./src/**/*"]
8+
}

‎packages/middlewares/image-processors/laplacian-image-processor/src/lambdas/processor/index.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ def laplacian_variance(image, depth=cv2.CV_64F, ksize=3):
9393

9494
def process_document(event: dict) -> dict:
9595
"""
96-
Converts the document associated with the given cloud event
97-
to plain text and publishes the result to the next middlewares.
96+
Computes the Laplacian variance for the given document.
9897
:param event: the received cloud event.
9998
"""
10099

‎packages/typescript-sdk/src/models/document/metadata/image/attributes/hashes.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,14 @@ export const HashesSchema = z.object({
7474
wavelet: z
7575
.string()
7676
.describe('The wavelet hash of the image.')
77+
.optional(),
78+
79+
/**
80+
* The color hash of the image.
81+
*/
82+
color: z
83+
.string()
84+
.describe('The color hash of the image.')
7785
.optional()
7886
});
7987

0 commit comments

Comments
 (0)
Please sign in to comment.