Commit b1acee5

Restructure
1 parent e8aed84 commit b1acee5

File tree

requirements.txt
src/main.py
venv/.gitignore

3 files changed: 82 additions, 0 deletions

requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
beautifulsoup4
requests
argparse  # argparse ships with the Python standard library; this entry pulls in an old PyPI backport and is not needed

src/main.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
from bs4 import BeautifulSoup
import requests
import argparse
import sys


def get_soup() -> BeautifulSoup:
    # fetch the latest-news page with a browser-like User-Agent so the request is less likely to be blocked
    headers: dict = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0'}
    response = requests.get('https://www.theaustralian.com.au/news/latest-news', headers=headers, timeout=10)
    response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
    html: bytes = response.content

    soup = BeautifulSoup(html, 'html.parser')
    return soup


def get_headlines(soup: BeautifulSoup) -> list[str]:
    # collect the headline links, lowercased and de-duplicated, and return them sorted
    headlines: set[str] = set()

    for h in soup.find_all('a', class_='storyblock_title_link'):
        headline: str = h.get_text(strip=True).lower()
        headlines.add(headline)

    return sorted(headlines)


def check_headlines(headlines: list[str], term: str) -> None:
    # print every headline, flag the ones containing the search term, then print a summary
    term_list: list[str] = []
    terms_found: int = 0

    for i, headline in enumerate(headlines, start=1):
        if term.lower() in headline:
            terms_found += 1
            term_list.append(headline)
            print(f'{i}: {headline.capitalize()} <------------------------------ "{term}"')
        else:
            print(f'{i}: {headline.capitalize()}')

    print('--------------------------------------------------------------------------------------')
    if terms_found:
        print(f'"{term}" was mentioned {terms_found} times.')
        print('======================================================================================')

        for i, headline in enumerate(term_list, start=1):
            print(f'{i}: {headline.capitalize()}')
    else:
        print(f'No matches found for: "{term}"')
        print('--------------------------------------------------------------------------------------')


def main():
    parser = argparse.ArgumentParser(prog='h2scraper', description='Scrape the news headlines from Australian news sites')
    parser.add_argument('-s', '--scan', required=True, help='term to search for in the scraped headlines')
    h2s_commands = parser.parse_args(sys.argv[1:])
    print(h2s_commands.scan)

    soup: BeautifulSoup = get_soup()
    headlines: list[str] = get_headlines(soup=soup)

    # candidate sites for later; only the first one is actually scraped so far
    sites = [
        {'url': 'https://www.theaustralian.com.au/news/latest-news', 'class': 'storyblock_title_link'},
        {'url': 'https://www.fedcourt.gov.au/news-and-events', 'class': ''},
        {'url': 'https://www.austlii.edu.au/', 'class': ''},
        {'url': 'https://asic.gov.au/newsroom', 'class': ''}
    ]

    check_headlines(headlines, h2s_commands.scan)


if __name__ == '__main__':
    main()

# TODO:
# - searching for 'ai' also matches every word that merely contains 'ai', such as 'mountains',
#   because this is a plain substring check; make the matching more accurate (see the sketch below)
# - add interaction with the user
# - search more sites
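The first TODO note points at the main limitation of the current matching: the condition in check_headlines() is a plain substring test, so scanning for 'ai' also flags 'mountains'. A minimal sketch of a stricter word-boundary match, using a hypothetical term_in_headline helper that is not part of this commit, could look like this:

import re

def term_in_headline(headline: str, term: str) -> bool:
    # \b anchors the term to word boundaries, so 'ai' matches 'ai summit'
    # but not 'mountains'; re.escape keeps any punctuation in the term literal
    return re.search(rf'\b{re.escape(term.lower())}\b', headline.lower()) is not None

With a helper like this, the check in check_headlines() would become "if term_in_headline(headline, term):" in place of the substring test.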

venv/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
!.gitignore
