Skip to content

Commit 2b7bbb7

Browse files
committed
Traverse the section levels of a collection and save them relationally
1 parent 0cbdb22 commit 2b7bbb7

File tree

2 files changed

+106
-9
lines changed

2 files changed

+106
-9
lines changed

api_downloader.py

Lines changed: 79 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from sqlalchemy import create_engine
1111
from sqlalchemy.orm import Session, sessionmaker
1212

13-
from models.collection import Base, Collection
13+
from models.collection import Base, Collection, Section
1414

1515
API_SEARCH = "https://www.ancestry.com/search/collections/catalog/api/search"
1616

@@ -71,7 +71,7 @@ def random_sleep(min_sec: float = 1, max_sec: float = 30, log=True, factor=1):
7171
"""
7272
min_sec = min_sec * factor
7373
max_sec = max_sec * factor
74-
sleep_time = random.randint(min_sec * 1000, max_sec * 1000) / 1000
74+
sleep_time = random.randint(int(min_sec * 1000), int(max_sec * 1000)) / 1000
7575
if log:
7676
print(f"Waiting for {sleep_time}s")
7777
time.sleep(sleep_time)
@@ -93,6 +93,10 @@ def __init__(self, user: str, pas: str):
9393
:param pas: Ancestry.com password
9494
"""
9595
self._session = requests.Session()
96+
self._session.headers.update({
97+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) '
98+
'Chrome/39.0.2171.95 Safari/537.36 '
99+
})
96100
self._username = user
97101
self._password = pas
98102
self._login()
@@ -101,19 +105,21 @@ def _login(self):
101105
"""Authenticate with the ancestry API"""
102106
if not self._authenticated:
103107
print("Logging in")
104-
self._session.post("https://www.ancestry.com/account/signin/frame/authenticate",
105-
{"username": self._username, "password": self._password})
106-
self._authenticated = True
108+
login_body = {"username": self._username, "password": self._password}
109+
response = self._session.post("https://www.ancestry.com/account/signin/frame/authenticate",
110+
data=login_body,
111+
headers={"referer": "https://www.ancestry.com/account/signin/frame?"})
112+
self._authenticated = response.status_code == 200
107113

108114
def _init_db_engine(self):
    """Create the SQLite engine for ``self._sqlite_file`` and mark it as initialized.

    The engine is created with ``echo=True`` and ``Base.metadata.create_all``
    is run against it before ``self._engine_initialized`` is set.
    """
    database_url = f'sqlite:///{self._sqlite_file}'
    self._db_engine = create_engine(database_url, echo=True)
    # Emit CREATE TABLE statements for the models registered on Base.
    Base.metadata.create_all(bind=self._db_engine)
    self._engine_initialized = True
112118

113-
def _get_db_session(self) -> Session:
119+
def _get_db_session(self, autocommit=False) -> Session:
114120
if not self._engine_initialized:
115121
self._init_db_engine()
116-
return sessionmaker(bind=self._db_engine)()
122+
return sessionmaker(bind=self._db_engine, autocommit=autocommit)()
117123

118124
def _get_metadata_target(self) -> int:
119125
""" Selects next target for updating collection metadata
@@ -244,6 +250,70 @@ def get_metadata_loop(self, limit_seconds=600):
244250
if time.time() - started > limit_seconds:
245251
break
246252

253+
def get_browse_values(self, dbid: int):
    """Download the browse sections of a collection and store them relationally.

    Looks up the ``Collection`` whose ``collection_id`` equals ``dbid``. If it
    exists, its ``sections`` are replaced with ``Section`` rows built from the
    browse-elements API, recursing into child levels whenever the API reports
    ``ContainsChildLevels``. If no such collection exists, nothing is fetched.

    :param dbid: Ancestry collection id (matched against ``Collection.collection_id``)
    :return: None
    """
    # Local import so this method does not depend on the file's import block.
    from urllib.parse import quote

    def format_url(path: List[str] = None) -> str:
        """Build the browse-elements URL, optionally narrowed to a section path."""
        url = f"https://www.ancestry.com/imageviewer/api/media/browse-elements?dbId={dbid}"
        if path is not None:
            # Escape the joined path so values containing '&', '#', '+' or '='
            # cannot corrupt the query string.
            encoded_path = quote("|".join(path), safe="")
            url += f"&path={encoded_path}"
        return url

    def build_section(element: dict, has_children: bool) -> Section:
        """Turn one ``BrowseSubElements`` entry into an unsaved ``Section``."""
        section = Section(value=element["PathValue"],
                          locale_value=element["LocalizedPathValue"],
                          has_child_levels=has_children)
        description = element.get("PathDescription")
        if description:
            section.description = description
        return section

    def get_children(path: List[str] = None) -> List[Section]:
        """Recursively fetch the sections found below ``path``."""
        path = [] if path is None else path
        print(path)
        browse_element = self._session.get(format_url(path=path)).json()["browseElement"]
        has_children = browse_element["ContainsChildLevels"]
        children = []
        for element in browse_element["BrowseSubElements"]:
            child = build_section(element, has_children)
            if has_children:
                # Extend a copy of the path; sibling calls must not share state.
                child.children = get_children(path + [element["PathValue"]])
            children.append(child)
        return children

    db_session = self._get_db_session()
    try:
        # A single query both checks for existence and loads the row.
        collection = db_session.query(Collection).filter_by(collection_id=dbid).first()
        if collection is None:
            return

        collection.sections = []

        res = self._session.get(format_url())
        print(res.text)
        browse_element = res.json()["browseElement"]
        has_children = browse_element["ContainsChildLevels"]
        sub_elements = browse_element["BrowseSubElements"]

        for index, element in enumerate(sub_elements, start=1):
            print(f"section {index} of {len(sub_elements)}")
            section = build_section(element, has_children)
            if has_children:
                print("getting children for top section")
                section.children = get_children([section.value])

            collection.sections.append(section)
            # Commit per top-level section so finished work survives a later failure.
            db_session.commit()
        # Also persists the cleared section list when there were no sub-elements.
        db_session.commit()
    finally:
        # Release the session even if a request or JSON parse raised.
        db_session.close()
316+
247317

248318
if __name__ == '__main__':
249319
dotenv.load_dotenv()
@@ -257,3 +327,5 @@ def get_metadata_loop(self, limit_seconds=600):
257327
# controller.load_collections_into_db_from_disk()
258328

259329
# controller.get_metadata_loop()
330+
331+
print(controller.get_browse_values(int(input(">>"))))

models/collection.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
from datetime import datetime
22
from typing import List
33

4-
from sqlalchemy import Boolean, Column, Integer, Sequence, String, TIMESTAMP
4+
from sqlalchemy import Boolean, Column, ForeignKey, Integer, Sequence, String, TIMESTAMP
55
from sqlalchemy.ext.declarative import declarative_base
6+
from sqlalchemy.orm import relationship
67

78
Base = declarative_base()
89

910

1011
class Collection(Base):
1112
"""An ancestry.com record collection"""
1213
__tablename__ = 'collection'
13-
id = Column(Integer, Sequence('collection_section_id_seq'), primary_key=True)
14+
id = Column(Integer, Sequence('collection_id_seq'), primary_key=True)
1415
# Ancestry collection id
1516
collection_id = Column(Integer, nullable=False, index=True, unique=True)
1617
# name (ie 1790 census)
@@ -44,6 +45,8 @@ class Collection(Base):
4445
time_created = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
4546
time_updated = Column(TIMESTAMP, onupdate=datetime.utcnow, nullable=False, default=datetime.utcnow)
4647

48+
sections = relationship("Section")
49+
4750
def __repr__(self):
4851
return "<Collection(id='%s')>" % self.collection_id
4952

@@ -52,3 +55,25 @@ def get_levels(self) -> List[str]:
5255

5356
def set_levels(self, levels: List[str]):
    """Store ``levels`` on ``navigation_levels`` as one pipe-delimited string."""
    delimiter = "|"
    self.navigation_levels = delimiter.join(levels)
58+
59+
60+
class Section(Base):
    """A breadcrumb/browse section for a collection breakdown"""
    __tablename__ = "section"
    id = Column(Integer, Sequence('section_id_seq'), primary_key=True)
    # Section value and its localized form
    value = Column(String(256))
    locale_value = Column(String(256))
    # Optional longer description of the section
    description = Column(String(1024))

    # Whether further browse levels exist below this section
    has_child_levels = Column(Boolean)

    # Foreign key to collection.id; nullable, so a section need not reference a collection
    collection = Column(Integer, ForeignKey('collection.id'), nullable=True)

    # Self-referential foreign key to the parent section; nullable
    parent_id = Column(Integer, ForeignKey('section.id'), nullable=True)
    children = relationship("Section")

    time_created = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
    time_updated = Column(TIMESTAMP, onupdate=datetime.utcnow, nullable=False, default=datetime.utcnow)

    def __repr__(self):
        # Show the real primary key; the value used to be mislabelled as the id.
        return "<Section(id='%s', value='%s')>" % (self.id, self.value)

0 commit comments

Comments
 (0)