Skip to content

Commit f1b308a

Browse files
authored
changes in readme (#2)
* added slides * added preview notebook
1 parent 34939b9 commit f1b308a

File tree

6 files changed

+348
-75
lines changed

6 files changed

+348
-75
lines changed

assets/slides_odsc2022.pdf

7.8 MB
Binary file not shown.

docs/internal_notes.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,15 @@ docker run --user root -e GRANT_SUDO=yes -it app bash
3737

3838

3939

40+
```
41+
Go to DIR: /projects/search-engine-workshop
42+
Type: docker-compose up
43+
44+
In the notebooks test... checks the milvus and elastic connections
45+
46+
47+
```
48+
gsutil -m cp -r gs://np-training-tmp/stackoverflow/final* gs://np-public-training-temp/stackoverflow/
4049
```
4150
4251
```

notebooks/01_workshop_data_preview.ipynb

Lines changed: 257 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 7,
5+
"execution_count": 2,
66
"id": "724dc187-f812-4c97-81dd-ad527f9d8338",
77
"metadata": {},
88
"outputs": [],
99
"source": [
1010
"import pandas as pd\n",
11-
"from IPython.display import JSON\n"
11+
"from IPython.display import JSON\n",
12+
"import metrics_utils"
1213
]
1314
},
1415
{
@@ -628,6 +629,260 @@
628629
"metadata": {},
629630
"outputs": [],
630631
"source": []
632+
},
633+
{
634+
"cell_type": "code",
635+
"execution_count": null,
636+
"id": "473e7e29-7a27-4030-aad3-c60c89dc19bd",
637+
"metadata": {},
638+
"outputs": [],
639+
"source": []
640+
},
641+
{
642+
"cell_type": "code",
643+
"execution_count": null,
644+
"id": "ce7fc618-3b9c-450e-a89f-576d47fba15e",
645+
"metadata": {},
646+
"outputs": [],
647+
"source": []
648+
},
649+
{
650+
"cell_type": "code",
651+
"execution_count": null,
652+
"id": "30a57006-3696-4a2d-82ca-726ee7c5b6b3",
653+
"metadata": {},
654+
"outputs": [],
655+
"source": []
656+
},
657+
{
658+
"cell_type": "markdown",
659+
"id": "29ebbeec-f1de-4d07-b603-917e5aa3928b",
660+
"metadata": {},
661+
"source": [
662+
"## Metrics"
663+
]
664+
},
665+
{
666+
"cell_type": "code",
667+
"execution_count": 4,
668+
"id": "3c824225-1fe7-488a-a291-f8ade3f82a82",
669+
"metadata": {},
670+
"outputs": [
671+
{
672+
"data": {
673+
"text/plain": [
674+
"\u001b[0;31mType:\u001b[0m module\n",
675+
"\u001b[0;31mString form:\u001b[0m <module 'metrics_utils' from '/home/jupyter/projects/search-engine-workshop/notebooks/metrics_utils.py'>\n",
676+
"\u001b[0;31mFile:\u001b[0m ~/projects/search-engine-workshop/notebooks/metrics_utils.py\n",
677+
"\u001b[0;31mSource:\u001b[0m \n",
678+
"\u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\n",
679+
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
680+
"\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
681+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
682+
"\u001b[0;34m\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
683+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
684+
"\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m\u001b[0m\n",
685+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
686+
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
687+
"\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mmean_reciprocal_rank\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
688+
"\u001b[0;34m\u001b[0m \u001b[0mmrr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n",
689+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
690+
"\u001b[0;34m\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
691+
"\u001b[0;34m\u001b[0m \u001b[0mfirst_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
692+
"\u001b[0;34m\u001b[0m \u001b[0mmrr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfirst_index\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
693+
"\u001b[0;34m\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
694+
"\u001b[0;34m\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\n",
695+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
696+
"\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmrr\u001b[0m\u001b[0;34m\u001b[0m\n",
697+
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
698+
"\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0maverage_precision\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
699+
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
700+
"\u001b[0;34m\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
701+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
702+
"\u001b[0;34m\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
703+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
704+
"\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
705+
"\u001b[0;34m\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
706+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
707+
"\u001b[0;34m\u001b[0m \u001b[0map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n",
708+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
709+
"\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
710+
"\u001b[0;34m\u001b[0m \u001b[0map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
711+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
712+
"\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m\u001b[0m\n",
713+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
714+
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
715+
"\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mall_metrics\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
716+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
717+
"\u001b[0;34m\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m\u001b[0m\n",
718+
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
719+
"\u001b[0;34m\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n",
720+
"\u001b[0;34m\u001b[0m \u001b[0;34m\"p@1\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
721+
"\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"p@5\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
722+
"\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"p@10\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
723+
"\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"mrr\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mmean_reciprocal_rank\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
724+
"\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"map\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0maverage_precision\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
725+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
726+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
727+
"\u001b[0;34m\u001b[0m \u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n",
728+
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
729+
"\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n"
730+
]
731+
},
732+
"metadata": {},
733+
"output_type": "display_data"
734+
}
735+
],
736+
"source": [
737+
"??metrics_utils"
738+
]
739+
},
740+
{
741+
"cell_type": "code",
742+
"execution_count": null,
743+
"id": "9b37a9b9-ab34-4152-88af-22728c8758a9",
744+
"metadata": {},
745+
"outputs": [],
746+
"source": []
747+
},
748+
{
749+
"cell_type": "markdown",
750+
"id": "90285ffa-4312-4ea8-84a6-595199688140",
751+
"metadata": {},
752+
"source": [
753+
"relevant result at the end"
754+
]
755+
},
756+
{
757+
"cell_type": "code",
758+
"execution_count": 11,
759+
"id": "356a2b4a-6f3d-42df-bf65-7796bc29c7d9",
760+
"metadata": {},
761+
"outputs": [
762+
{
763+
"data": {
764+
"text/plain": [
765+
"{'p@1': 0.0, 'p@5': 0.2, 'p@10': 0.1, 'mrr': 0.2, 'map': 0.2}"
766+
]
767+
},
768+
"execution_count": 11,
769+
"metadata": {},
770+
"output_type": "execute_result"
771+
}
772+
],
773+
"source": [
774+
"metrics_utils.all_metrics([0,0,0,0,1])"
775+
]
776+
},
777+
{
778+
"cell_type": "markdown",
779+
"id": "59f9f574-e506-45e0-9c4c-c65a2b3827eb",
780+
"metadata": {},
781+
"source": [
782+
"relevant result at the beginning"
783+
]
784+
},
785+
{
786+
"cell_type": "code",
787+
"execution_count": 12,
788+
"id": "8252bfbc-7184-437b-91e6-b60d166a9742",
789+
"metadata": {},
790+
"outputs": [
791+
{
792+
"data": {
793+
"text/plain": [
794+
"{'p@1': 1.0, 'p@5': 0.2, 'p@10': 0.1, 'mrr': 1.0, 'map': 1.0}"
795+
]
796+
},
797+
"execution_count": 12,
798+
"metadata": {},
799+
"output_type": "execute_result"
800+
}
801+
],
802+
"source": [
803+
"metrics_utils.all_metrics([1,0,0,0,0])"
804+
]
805+
},
806+
{
807+
"cell_type": "code",
808+
"execution_count": 13,
809+
"id": "196acac3-a263-4307-8ef9-075e7492870c",
810+
"metadata": {},
811+
"outputs": [
812+
{
813+
"data": {
814+
"text/plain": [
815+
"0.2"
816+
]
817+
},
818+
"execution_count": 13,
819+
"metadata": {},
820+
"output_type": "execute_result"
821+
}
822+
],
823+
"source": []
824+
},
825+
{
826+
"cell_type": "markdown",
827+
"id": "e3b1413a-81a2-4a7d-9a46-ac6c9938b17e",
828+
"metadata": {},
829+
"source": [
830+
"map captures that the relevant results are shown at the beginning"
831+
]
832+
},
833+
{
834+
"cell_type": "code",
835+
"execution_count": 14,
836+
"id": "ad52c3ad-952a-4340-87bd-d20369cb420d",
837+
"metadata": {},
838+
"outputs": [
839+
{
840+
"data": {
841+
"text/plain": [
842+
"{'p@1': 0.0,\n",
843+
" 'p@5': 0.4,\n",
844+
" 'p@10': 0.2,\n",
845+
" 'mrr': 0.3333333333333333,\n",
846+
" 'map': 0.41666666666666663}"
847+
]
848+
},
849+
"execution_count": 14,
850+
"metadata": {},
851+
"output_type": "execute_result"
852+
}
853+
],
854+
"source": [
855+
"metrics_utils.all_metrics([0,0,1,1,0])"
856+
]
857+
},
858+
{
859+
"cell_type": "code",
860+
"execution_count": 15,
861+
"id": "8f61fc8e-7292-43dc-8f29-501d7fee8876",
862+
"metadata": {},
863+
"outputs": [
864+
{
865+
"data": {
866+
"text/plain": [
867+
"{'p@1': 1.0, 'p@5': 0.4, 'p@10': 0.2, 'mrr': 1.0, 'map': 1.0}"
868+
]
869+
},
870+
"execution_count": 15,
871+
"metadata": {},
872+
"output_type": "execute_result"
873+
}
874+
],
875+
"source": [
876+
"metrics_utils.all_metrics([1,1,0,0,0])"
877+
]
878+
},
879+
{
880+
"cell_type": "code",
881+
"execution_count": null,
882+
"id": "8505c6aa-d009-4dea-9263-38ca4d9f2c4b",
883+
"metadata": {},
884+
"outputs": [],
885+
"source": []
631886
}
632887
],
633888
"metadata": {

readme.md

Lines changed: 11 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,25 +28,30 @@ Internal notebooks that show how to fetch a dump of the Stack Overflow XML
2828
[notebook](notebooks/01_b_setup.ipynb)
2929
Process the XML dump and save to smaller parquet files
3030

31-
2. Non Deep Learning Retrieval
31+
3. Non Deep Learning Retrieval
3232

33-
[Link](notebooks/02_indexing_es.ipynb)
33+
[Link](notebooks/02_retrieval_sparse.ipynb)
3434

3535
Shows how to index and retrieve documents using ElasticSearch
3636

37-
3. Deep Learning Retrieval
37+
4. Deep Learning Retrieval
3838

3939
Shows how to index and retrieve documents using a fine-tuned Deep Learning Retriever
40-
[Link](notebooks/02_indexing_faiss.ipynb)
40+
[Link](notebooks/02_retrieval_dense_milvus.ipynb)
4141

42-
4. ANN
42+
Sample notebook for a cross-encoder, taken from the SentenceTransformers docs
43+
[Link](notebooks/other__retrieve_rerank_simple_wikipedia.ipynb)
44+
45+
5. ANN
4346
Shows how to speed up Deep Learning retrieval by exploring different ANN indexes
47+
[Link](notebooks/ann_benchmark_recall.ipynb)
48+
4449

4550

4651

4752
## Slides
4853

49-
[Slides][assets/slides.pdf)
54+
[ODSC 2022 Slides](assets/slides_odsc2022.pdf)
5055

5156

5257
## Contact
@@ -59,29 +64,4 @@ For help or feedback, please reach out to :
5964

6065

6166

62-
## Other
63-
- compare against tf-idf
64-
- elastic search phrase match
65-
- elastic search autocomplete / spellcheck / facets
66-
- weaviate indexing
67-
- faiss ann indexes
68-
- faiss ann retrieval / recall slowness
69-
70-
- backup elastic search cluster
71-
- show individual document score
72-
- show how document score changes a bit when hitting all shards
73-
74-
- show examples of similar embedding
75-
- how to deal with long passages
76-
- u may not need semantic search, might get away eith doc2query
77-
78-
79-
Go to DIR: /projects/search-engine-workshop
80-
Type: docker-compose up
81-
82-
In the notebooks test... checks the milvus and elastic connections
83-
8467

85-
```
86-
gsutil -m cp -r gs://np-training-tmp/stackoverflow/final* gs://np-public-training-temp/stackoverflow/
87-
```

0 commit comments

Comments
 (0)