Skip to content

Commit 34939b9

Browse files
authored
Initial Pr (#1)
Initial Pr
1 parent 8ca085f commit 34939b9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+34816
-0
lines changed

.dockerignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
data/
2+
workshop_infra/
3+
!workshop_infra/scripts/
4+
5+
6+

.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,18 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
131+
132+
data/
133+
134+
135+
workshop_infra/cert/*
136+
workshop_infra/config.yaml
137+
workshop_infra/key_file.json
138+
139+
*.db
140+
tmp/
141+
.DS_Store
142+
143+
!/**/.gitkeep
144+
workshop_infra/keyfile.json

Dockerfile

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
2+
#FROM jupyter/scipy-notebook:python-3.10.6
3+
FROM jupyter/scipy-notebook:python-3.7.12
4+
5+
6+
7+
USER root
8+
9+
10+
# Install OS-level tools as root; the apt package lists are removed in the same layer so they do not persist in the image.
RUN apt-get update && apt-get --yes install apt-utils && \
11+
apt-get --yes --no-install-recommends install htop tmux graphviz curl build-essential libsasl2-dev gfortran && \
12+
apt-get clean && rm -rf /var/lib/apt/lists/*
13+
14+
15+
# set the user back to original setting
16+
USER $NB_UID
17+
18+
19+
20+
# Install dependencies from the environment.yaml file
21+
COPY --chown=${NB_UID}:${NB_GID} environment.yaml /tmp/
22+
23+
RUN mamba env update -n base -f /tmp/environment.yaml && \
24+
fix-permissions "${CONDA_DIR}" && \
25+
fix-permissions "/home/${NB_USER}"
26+
27+
COPY --chown=${NB_UID}:${NB_GID} workshop_infra/scripts /tmp/scripts/
28+
29+
USER root
30+
31+
RUN bash /tmp/scripts/build_setup_root.sh
32+
USER $NB_UID
33+
34+
35+
COPY --chown=${NB_UID}:${NB_GID} notebooks/workshop_setup.ipynb /tmp/workshop/notebooks/
36+
37+
RUN bash /tmp/scripts/build_setup_user.sh
38+
39+
COPY --chown=${NB_UID}:${NB_GID} . /tmp/workshop/
40+
41+
ENV PATH="/opt/google-cloud-sdk/bin:${PATH}"
42+
43+
44+
#COPY --chown=${NB_UID}:${NB_GID} docker-setup.sh /tmp/
45+
46+
#COPY --chown=${NB_UID}:${NB_GID} setup.ipynb /tmp/
47+
48+
# RUN papermill /tmp/setup.ipynb /tmp/setup__out.ipynb -k python3 --log-output --log-level INFO --progress-bar && \
49+
# fix-permissions "${CONDA_DIR}" && \
50+
# fix-permissions "/home/${NB_USER}"

assets/all_assets.sw

2.99 MB
Binary file not shown.

data/.gitkeep

Whitespace-only changes.

docker-compose.yaml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
version: "3.0"
2+
services:
3+
elasticsearch:
4+
container_name: es-container
5+
image: docker.elastic.co/elasticsearch/elasticsearch:8.4.3
6+
environment:
7+
- xpack.security.enabled=false
8+
- "discovery.type=single-node"
9+
ports:
10+
- 9200:9200
11+
volumes:
12+
- esdata:/usr/share/elasticsearch/data
13+
14+
# milvus:
15+
# container_name: milvus
16+
# image: milvusdb/milvus:1.1.1-cpu-d061621-330cc6
17+
# ports:
18+
# - 19530:19530
19+
# - 19121:19121
20+
# volumes:
21+
# - milvusdata:/var/lib/milvus
22+
23+
24+
milvus:
25+
container_name: milvus
26+
build:
27+
context: docker_milvus
28+
ports:
29+
- 19530:19530
30+
- 19121:19121
31+
volumes:
32+
- milvusdata:/var/lib/milvus
33+
34+
35+
weaviate:
36+
image: semitechnologies/weaviate:1.14.0
37+
ports:
38+
- 8081:8080
39+
environment:
40+
QUERY_DEFAULTS_LIMIT: 25
41+
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
42+
PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
43+
DEFAULT_VECTORIZER_MODULE: 'none'
44+
ENABLE_MODULES: ''
45+
CLUSTER_HOSTNAME: 'node1'
46+
volumes:
47+
- weaviatedata:/var/lib/weaviate
48+
volumes:
49+
esdata:
50+
weaviatedata:
51+
milvusdata:

docker_milvus/Dockerfile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
FROM ubuntu:18.04
2+
3+
# Version of the milvus apt package; read by install.sh as $MILVUS_VERSION.
ENV MILVUS_VERSION="2.1.4-1"
4+
5+
COPY install.sh /tmp/install.sh
6+
7+
RUN bash /tmp/install.sh
8+
9+
10+
# ARG S6_OVERLAY_VERSION=3.1.2.1
11+
12+
13+
# ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz /tmp
14+
15+
# RUN tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz
16+
17+
# ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-x86_64.tar.xz /tmp
18+
19+
# RUN tar -C / -Jxpf /tmp/s6-overlay-x86_64.tar.xz
20+
21+
22+
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
23+
24+
25+
26+
CMD ["/usr/bin/supervisord"]

docker_milvus/install.sh

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Abort on the first failing command so that a failed package install fails the image build
# instead of being masked by the exit status of the script's last command.
set -e
apt-get update -y
2+
3+
# Needed for the add-apt-repository call below; apt-get is used because apt's CLI is not intended for scripts.
apt-get install software-properties-common -y
4+
5+
add-apt-repository ppa:milvusdb/milvus
6+
7+
8+
apt-get update -y
9+
10+
11+
apt-get install "milvus=$MILVUS_VERSION" -y
12+
13+
14+
15+
#mkdir -p /etc/services.d/system/
16+
17+
#cp /lib/systemd/system/milvus* /etc/services.d/system/
18+
19+
#cp /lib/systemd/system/milvus* /etc/services.d/system/
20+
21+
#COPY resources/docker/services.d /etc/services.d
22+
23+
24+
25+
apt-get update && apt-get install -y supervisor
26+
mkdir -p /var/log/supervisor

docker_milvus/readme.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
https://github.com/just-containers/s6-overlay
2+
3+
4+
5+
cat /etc/services.d/system/milvus-etcd.service
6+
ExecStart=/usr/bin/milvus-etcd --data-dir /var/lib/milvus/etcd-data
7+
8+
9+
cat /etc/services.d/system/milvus-minio.service
10+
ExecStart=/usr/bin/milvus-minio server /var/lib/milvus/minio-data
11+
12+
13+
14+
cat /etc/services.d/system/milvus.service
15+
16+
Environment=MILVUSCONF=/etc/milvus/configs/
17+
ExecStart=/usr/bin/milvus run standalone
18+
19+
20+
21+
https://gdevillele.github.io/engine/admin/using_supervisord/

docker_milvus/supervisord.conf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[supervisord]
2+
nodaemon=true
3+
4+
[program:milvus-minio]
5+
command=/usr/bin/milvus-minio server /var/lib/milvus/minio-data
6+
7+
[program:milvus-etcd]
8+
command=/usr/bin/milvus-etcd --data-dir /var/lib/milvus/etcd-data
9+
10+
11+
[program:milvus]
12+
environment=MILVUSCONF=/etc/milvus/configs/
13+
command=/usr/bin/milvus run standalone

docs/internal_notes.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Internal Notes
2+
3+
## Setup Dep
4+
5+
Install other deps
6+
7+
```bash
8+
sudo apt update && sudo apt install -y p7zip-full
9+
```
10+
11+
Create conda environment
12+
13+
```bash
14+
conda create -n stackoverflow python=3.7 mamba
15+
conda activate stackoverflow
16+
mamba env update -n stackoverflow -f environment.yaml
17+
# mamba install anaconda jupyter ipykernel nb_conda_kernels
18+
19+
mamba install ipython ipykernel nb_conda_kernels
20+
21+
ipython kernel install --user --name=stackoverflow
22+
23+
24+
conda create --name stackoverflow --clone base
25+
26+
```
27+
28+
Start ES, Milvus, and Weaviate for local dev
29+
30+
```bash
31+
docker-compose up
32+
```
33+
34+
```bash
35+
docker run --user root -e GRANT_SUDO=yes -it app bash
36+
```
37+
38+
39+
40+
```
41+
42+
```

docs/slide_notes.md

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
PUT /items
2+
3+
```json
4+
{
5+
"mappings": {
6+
"properties": {
7+
"title": { "type": "text" },
8+
"description": { "type": "text" },
9+
10+
"brand": { "type": "keyword" },
11+
"product_type": { "type": "keyword" },
12+
13+
"price": { "type": "double" }
14+
}
15+
}
16+
}
17+
```
18+
19+
Nike shoe under 100$
20+
21+
GET /items/_search
22+
23+
```json
24+
{
25+
"query": {
26+
27+
"multi_match": {
28+
"query": "Nike shoe under 100$",
29+
"fields": ["title^2", "description^1"]
30+
}
31+
32+
,"bool": {
33+
"filter": [
34+
{ "term": { "brand": "nike" }}
35+
]
36+
}
37+
,"filtered": {
38+
"filter": {
39+
"range": {
40+
"price" : { "lte": 100 }
41+
}
42+
}
43+
}
44+
}
45+
46+
```
47+
48+
49+
50+
## PR curve
51+
```
52+
Recall Perfect Classifier Baseline Classifier Good Classifier High Precision
53+
0.1 0.95 0.5 0.9 0.91
54+
0.2 0.95 0.5 0.85 0.91
55+
0.3 0.95 0.5 0.85 0.91
56+
0.4 0.95 0.5 0.8 0.9
57+
0.5 0.95 0.5 0.8 0.4
58+
0.6 0.95 0.5 0.8 0.4
59+
0.7 0.95 0.5 0.8 0.4
60+
0.8 0.95 0.5 0.8 0.2
61+
0.9 0.95 0.5 0.7 0.2
62+
1 0.9 0.5 0.2 0.1
63+
```
64+
65+
66+
67+
68+
dcg
69+
70+
```
71+
Discounted\space Cumulative\space Gain
72+
= \sum_{1}^{p}\frac{ relevance (i)}{log_{2}(i+1)}
73+
74+
75+
\\
76+
DCG = {\color{Green}\frac{3}{log_{2}(2)} } + \frac{1}{log_{2}(3)} + {\color{Red}\frac{0}{log_{2}(4)} }+\frac{2}{log_{2}(5)} = 4.49
77+
78+
\\
79+
80+
(Ideal)\space DCG = {\color{Green}\frac{3}{log_{2}(2)} } + \frac{2}{log_{2}(3)} + \frac{1}{log_{2}(4)} + {\color{Red}\frac{0}{log_{2}(5)} } = 5.88
81+
82+
\\
83+
Normalized\space Discounted\space Cumulative\space Gain
84+
= \frac{ DCG}{Ideal\space DCG} = \frac{4.49}{5.88}
85+
```

environment.yaml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#name: workshop
2+
channels:
3+
- conda-forge
4+
dependencies:
5+
- python==3.7.*
6+
- pip
7+
- mamba
8+
- nb_conda_kernels
9+
- pyarrow==9.0.*  # NOTE(review): pyarrow is also pinned to 10.0.* in the pip section of this file; confirm which version is intended
10+
- lxml==4.9.*
11+
- pip:
12+
- google-cloud-bigquery-storage
13+
- elasticsearch==8.4.*
14+
- weaviate-client==3.8.0
15+
- pandarallel==1.6.*
16+
- milvus==2.1.*
17+
- rich==12.6.*
18+
- jupyterlab-execute-time==2.1.*
19+
- sentence-transformers==2.2.*
20+
- ipywidgets==8.0.*
21+
- pyarrow==10.0.*
22+
- gcsfs==2022.10.*
23+
- papermill==2.3.*
24+
- rank_bm25==0.2.*
25+
- faiss_cpu==1.7.*
26+
27+
28+
29+
#
30+
#- google-cloud-bigquery-storage==2.16.*
31+
#- modin[ray]
32+
# - pymilvus==1.1.0
33+
#- pandas-gbq==0.17.*
34+
#- git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[faiss]

0 commit comments

Comments
 (0)