I have been writing a monitoring script that checks whether there are changes on a webpage. When a change has happened, I want to be notified by printing out that there is a difference.
I have also written a spam-filter function to handle a cache issue where a repo count can flip from an old count to a new one and back again. Here is the code:
import sys
import threading
import time
from datetime import datetime, timedelta
from typing import Union
import requests
from bs4 import BeautifulSoup
from loguru import logger
URLS: set = set()
def filter_spam(delay: int, sizes: dict, _requests: dict) -> Union[dict, bool]:
    """Filter requests down to those outside the cooldown window.

    Args:
        delay: Cooldown period in seconds.
        sizes: Mapping whose keys are the candidate request identifiers.
        _requests: Mapping of request identifier -> last-seen timestamp.
            Mutated in place: accepted requests get a fresh timestamp.

    Returns:
        The updated ``_requests`` dict when at least one request passed the
        cooldown filter, otherwise ``False``.
    """
    # Take the time once so every comparison and every new timestamp in this
    # call agree (the original called datetime.now() repeatedly, so entries
    # stamped in the same call could differ by a few microseconds).
    now = datetime.now()

    def is_fresh(request) -> bool:
        # Accept requests never seen before, or last seen at least `delay`
        # seconds ago.
        return request not in _requests or now - _requests[request] >= timedelta(seconds=delay)

    if fresh := [r for r in sizes if is_fresh(r)]:
        # Stamp only the requests we are actually letting through.
        for r in fresh:
            _requests[r] = now
        return _requests
    return False
class MonitorProduct:
    """Polls a single GitHub search URL and logs when the scraped data changes."""

    def __init__(self):
        # Last-notification timestamp per repo count; consumed by filter_spam.
        self._requests: dict[str, datetime] = {}
        # Most recently observed page data, compared against each new scrape.
        self.previous_state: dict = {}

    def doRequest(self, url: str) -> None:
        """Poll ``url`` every 30 seconds until it disappears from the global URLS set.

        Runs inside its own thread; ``sys.exit()`` raises SystemExit, which
        terminates only this thread.
        """
        while True:
            if url not in URLS:
                # URL was removed from the database; stop this monitor thread.
                logger.info(f'Deleting url from monitoring: {url}')
                sys.exit()
            try:
                # timeout keeps a hung connection from stalling the thread
                # forever; a transient network error must not kill the monitor.
                response = requests.get(url, timeout=30)
            except requests.RequestException as exc:
                logger.info(f"Request failed for {url} -> {exc}")
            else:
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Hoist the find() so the count span is located only once.
                    repo_span = soup.find("span", {"data-search-type": "Repositories"})
                    if repo_span:  # page exposes a repository count
                        self.compareData({
                            'title': soup.find("input", {"name": "q"})['value'],
                            'repo_count': {repo_span.text.strip(): None},
                        })
                    else:
                        logger.info(f"No diff for {url}")
                else:
                    logger.info(f"Error for {url} -> {response.status_code}")
            time.sleep(30)

    def compareData(self, data: dict) -> None:
        """Log a notification when ``data`` differs from the previous scrape.

        Notifications are rate-limited via filter_spam (1 hour per repo count)
        to guard against a cache flapping between an old and a new count.
        """
        if self.previous_state != data:
            if filter_spam(3600, data['repo_count'], self._requests):
                logger.info(f"The data has been changed to {data}")
            self.previous_state = data
# mocked database
def database_urls() -> set:
    """Return the set of search URLs to monitor.

    Mocked stand-in for the real database query (PostgreSQL via peewee).
    """
    queries = ('hello+world', 'python+3', 'world', 'wth')
    return {f'https://github.com/search?q={q}' for q in queries}
if __name__ == '__main__':
    # Poll the (mocked) database every 10 seconds and spawn one monitor
    # thread per newly-appeared URL.
    while True:
        current = database_urls()          # latest URL set from the database
        added = current - URLS             # URLs we are not yet monitoring
        # Rebind the global so running monitor threads can notice removals.
        URLS = current
        for url in added:
            logger.info(f'Starting URL: {url}')
            threading.Thread(target=MonitorProduct().doRequest, args=(url,)).start()
        time.sleep(10)
The code is working pretty well. However, I have written a mocked database that is polled every 10 seconds (if I need to raise that interval, please explain why). The reason I wrote the mock is to show you how the script works; the real implementation will read from a database (PostgreSQL through peewee). I also used GitHub as an example because it was the easiest place for me to get values to compare against. I am aware that GitHub has an API, but in my situation I would like to gain more experience with BeautifulSoup, so I am using bs4 here.
To repeat the goal: the point of this script is to be notified whenever something has changed on the webpage.
I hope I can get some feedback from you lovely code reviewers!