#!/usr/bin/env python
# Economist Old Edition
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
try:
    from http.cookiejar import Cookie
except ImportError:  # Python 2
    from cookielib import Cookie

import json
from collections import defaultdict

from html5_parser import parse
from lxml import etree

from calibre.utils.date import parse_only_date
from calibre.web.feeds.news import BasicNewsRecipe
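
# Set edition_date to a string like '2022-06-11' to download a specific weekly
# edition; leave it as None to fetch the latest issue. (Assumed: the full
# recipe derives this value elsewhere.)
edition_date = None


# The helpers below are used by load_article_from_json() and
# economist_parse_index() but were missing from this copy of the recipe.
# They are minimal sketches of what the full recipe is assumed to define.

def E(parent, name, text='', **attrs):
    # Append a child element with the given tag, text and attributes to parent.
    ans = parent.makeelement(name, **attrs)
    ans.text = text
    parent.append(ans)
    return ans


def safe_dict(data, *names):
    # Walk nested dicts, returning {} instead of raising when a key is missing.
    ans = data
    for x in names:
        ans = ans.get(x) or {}
    return ans


def process_node(node, html_parent):
    # Convert one node of the JSON article tree into HTML under html_parent.
    # Simplified sketch: only 'tag' and 'text' nodes are handled.
    ntype = node.get('type')
    if ntype == 'tag':
        c = html_parent.makeelement(node['name'])
        c.attrib.update({k: v or '' for k, v in (node.get('attribs') or {}).items()})
        html_parent.append(c)
        for nc in node.get('children') or ():
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data') or ''
        if len(html_parent):
            t = html_parent[-1]
            t.tail = (t.tail or '') + text
        else:
            html_parent.text = (html_parent.text or '') + text
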
class JSONHasNoContent(ValueError):
    pass
def load_article_from_json(raw, root):
    # Replace the page body with a clean <article> built from the JSON payload
    # embedded in the page.
    # open('/t/raw.json', 'w').write(raw)
    try:
        data = json.loads(raw)['props']['pageProps']['content']
    except KeyError as e:
        raise JSONHasNoContent(e)
    if isinstance(data, list):
        data = data[0]
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')
    E(article, 'h4', data['subheadline'], style='color: red; margin: 0')
    E(article, 'h1', data['headline'], style='font-size: x-large')
    E(article, 'div', data['description'], style='font-style: italic')
    E(article, 'div', (data['datePublishedString'] or '') + ' | ' +
      (data['dateline'] or ''), style='color: gray; margin: 1em')
    main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
    if main_image_url:
        div = E(article, 'div')
        try:
            E(div, 'img', src=main_image_url)
        except Exception:
            pass
    for node in data.get('text') or ():
        process_node(node, article)
def cleanup_html_article(root):
    # Fallback cleanup when an article page has no JSON payload: keep only the
    # <main> content, and strip inline styles and buttons.
    main = root.xpath('//main')[0]
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    body.append(main)
    main.set('id', '')
    main.tag = 'article'
    for x in root.xpath('//*[@style]'):
        x.set('style', '')
    for x in root.xpath('//button'):
        x.getparent().remove(x)
def classes(classes):
    # BeautifulSoup attrs matcher: matches elements carrying any of the given
    # space-separated CSS classes.
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
class NoArticles(Exception):
    pass
def process_url(url):
    if url.startswith('/'):
        url = 'https://www.economist.com' + url
    return url
class Economist(BasicNewsRecipe):
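
    # Minimal recipe metadata. The title matches this recipe's name; the other
    # fields are conventional defaults assumed here, not taken from this copy.
    title = 'Economist Old Edition'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True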
    needs_subscription = False
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # Add a cookie indicating we have accepted Economist's cookie
        # policy (needed when running from some European countries)
        ck = Cookie(
            version=0,
            name='notice_preferences',
            value='2:',
            port=None,
            port_specified=False,
            domain='.economist.com',
            domain_specified=False,
            domain_initial_dot=True,
            path='/',
            path_specified=False,
            secure=False,
            expires=None,
            discard=False,
            comment=None,
            comment_url=None,
            rest={'HttpOnly': None},
            rfc2109=False
        )
        br.cookiejar.set_cookie(ck)
        br.set_handle_gzip(True)
        return br
    def publication_date(self):
        if edition_date:
            return parse_only_date(edition_date, as_utc=False)
        url = self.browser.open("https://www.economist.com/printedition").geturl()
        return parse_only_date(url.split("/")[-1], as_utc=False)
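
    def preprocess_raw_html(self, raw_html, url):
        # A minimal sketch, assumed rather than taken from this copy of the
        # recipe: article pages embed their content in a __NEXT_DATA__ JSON
        # blob just like the index page, so build the article with
        # load_article_from_json() and fall back to cleanup_html_article()
        # when no usable JSON is present.
        root = parse(raw_html)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        if script and script[0].text:
            try:
                load_article_from_json(script[0].text, root)
            except JSONHasNoContent:
                cleanup_html_article(root)
        else:
            cleanup_html_article(root)
        return etree.tostring(root, encoding='unicode')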
    def parse_index(self):
        # return [('Articles', [{'title':'test',
        #     'url':'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress'
        # }])]
        if edition_date:
            url = 'https://www.economist.com/weeklyedition/' + edition_date
            self.timefmt = ' [' + edition_date + ']'
        else:
            url = 'https://www.economist.com/printedition'
        # raw = open('/t/raw.html').read()
        raw = self.index_to_soup(url, raw=True)
        # with open('/t/raw.html', 'wb') as f:
        #     f.write(raw)
        soup = self.index_to_soup(raw)
        # nav = soup.find(attrs={'class':'navigation__wrapper'})
        # if nav is not None:
        #     a = nav.find('a', href=lambda x: x and '/printedition/' in x)
        #     if a is not None:
        #         self.log('Following nav link to current edition', a['href'])
        #         soup = self.index_to_soup(process_url(a['href']))
        ans = self.economist_parse_index(soup)
        if not ans:
            raise NoArticles(
                'Could not find any articles, either the '
                'economist.com server is having trouble and you should '
                'try later or the website format has changed and the '
                'recipe needs to be updated.'
            )
        return ans
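
    def economist_parse_index(self, soup):
        # Reconstructed scaffolding (assumed): the section/article loop below
        # survives from the original, but the method header and the extraction
        # of `data` from the page's __NEXT_DATA__ script tag were missing.
        script_tag = soup.find("script", id="__NEXT_DATA__")
        if script_tag is not None:
            data = json.loads(script_tag.string)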
            feeds_dict = defaultdict(list)
            for part in safe_dict(data, "props", "pageProps", "content", "hasPart", "parts"):
                section = safe_dict(part, "print", "section", "headline") or ''
                title = safe_dict(part, "headline") or ''
                url = safe_dict(part, "url", "canonical") or ''
                if not section or not title or not url:
                    continue
                desc = safe_dict(part, "description") or ''
                sub = safe_dict(part, "subheadline") or ''
                if sub and section != sub:
                    desc = sub + ' :: ' + desc
                if '/interactive/' in url:
                    self.log('Skipping interactive article:', title, url)
                    continue
                feeds_dict[section].append({"title": title, "url": url, "description": desc})
                self.log(' ', title, url, '\n ', desc)
            return [(section, articles) for section, articles in feeds_dict.items()]
        else:
            return []