#!/usr/bin/env python
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>

try:
    from http.cookiejar import Cookie  # Python 3
except ImportError:
    from cookielib import Cookie  # Python 2

import json
from collections import defaultdict

from html5_parser import parse
from lxml import etree

from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.utils.date import parse_only_date
from calibre.web.feeds.news import BasicNewsRecipe

# For past editions, set date to, for example, '2020-11-28'
edition_date = '2023-11-18'
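# Leave edition_date empty ('') to download the latest available edition
# instead; parse_index() and publication_date() below both check this.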


def E(parent, name, text='', **attrs):
    # Create an element with the given tag name, text and attributes, append
    # it to parent and return it.
    ans = parent.makeelement(name, **attrs)
    ans.text = text
    parent.append(ans)
    return ans


def process_node(node, html_parent):
    # Recursively convert one JSON DOM node into lxml elements under
    # html_parent.
    ntype = node.get('type')
    if ntype == 'tag':
        c = html_parent.makeelement(node['name'])
        c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
        html_parent.append(c)
        for nc in node.get('children', ()):
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            text = replace_entities(text)
            if len(html_parent):
                # Text after an existing child belongs in that child's tail
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text
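
# A sketch of the node shape process_node() consumes, inferred from the
# lookups above rather than from any documented Economist schema:
#   {'type': 'tag', 'name': 'p', 'attribs': {'class': '...'},
#    'children': [{'type': 'text', 'data': 'Some text'}]}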


def safe_dict(data, *names):
    # Walk nested dicts, returning {} instead of raising on a missing key.
    ans = data
    for x in names:
        ans = ans.get(x) or {}
    return ans
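
# Example: safe_dict(data, 'props', 'pageProps') returns
# data['props']['pageProps'] when every level exists, and {} as soon as any
# level is missing, so chained lookups never raise KeyError.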


class JSONHasNoContent(ValueError):
    pass


def load_article_from_json(raw, root):
    # Build the article HTML from the JSON in the page's __NEXT_DATA__ script
    # tag (see preprocess_raw_html() below), replacing the body of root.
    # open('/t/raw.json', 'w').write(raw)
    try:
        data = json.loads(raw)['props']['pageProps']['content']
    except KeyError as e:
        raise JSONHasNoContent(e)
    if isinstance(data, list):
        data = data[0]
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')
    E(article, 'h4', data['subheadline'], style='color: red; margin: 0')
    E(article, 'h1', data['headline'], style='font-size: x-large')
    E(article, 'div', data['description'], style='font-style: italic')
    E(article, 'div', (data['datePublishedString'] or '') + ' | ' +
      (data['dateline'] or ''), style='color: gray; margin: 1em')
    main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
    if main_image_url:
        div = E(article, 'div')
        try:
            E(div, 'img', src=main_image_url)
        except Exception:
            pass
    for node in data.get('text') or ():
        process_node(node, article)
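
# A sketch of the JSON payload load_article_from_json() expects, inferred
# from the key lookups above:
#   {'props': {'pageProps': {'content': {
#       'headline': ..., 'subheadline': ..., 'description': ...,
#       'datePublishedString': ..., 'dateline': ...,
#       'image': {'main': {'url': {'canonical': ...}}},
#       'text': [...]}}}}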


def cleanup_html_article(root):
    # Fallback cleanup for pages without usable JSON: keep only <main>,
    # renamed to <article>, and strip inline styles and buttons.
    main = root.xpath('//main')[0]
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    body.append(main)
    main.set('id', '')
    main.tag = 'article'
    for x in root.xpath('//*[@style]'):
        x.set('style', '')
    for x in root.xpath('//button'):
        x.getparent().remove(x)


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
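
# Example: classes('teaser newsletter-form') builds a BeautifulSoup query
# that matches any tag whose class attribute shares at least one name with
# the given space-separated list.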


def new_tag(soup, name, attrs=()):
    # Create a tag compatibly across BeautifulSoup versions: prefer
    # soup.new_tag() where available, else fall back to the Tag constructor.
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NoArticles(Exception):
    pass


def process_url(url):
    # Make site-relative links absolute.
    if url.startswith('/'):
        url = 'https://www.economist.com' + url
    return url
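
# Example: process_url('/weeklyedition') -> 'https://www.economist.com/weeklyedition'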


class Economist(BasicNewsRecipe):

    title = 'The Economist'
    language = 'en'
    encoding = 'utf-8'
    __author__ = "Kovid Goyal"
    description = (
        'Global news and current affairs from a European'
        ' perspective. Best downloaded on Friday mornings (GMT)'
    )
    extra_css = '''
    .headline {font-size: x-large;}
    h2 { font-size: small; }
    h1 { font-size: medium; }
    em.Bold {font-weight:bold;font-style:normal;}
    em.Italic {font-style:italic;}
    p.xhead {font-weight:bold;}
    .pullquote {
        float: right;
        font-size: larger;
        font-weight: bold;
        font-style: italic;
        page-break-inside:avoid;
        border-bottom: 3px solid black;
        border-top: 3px solid black;
        width: 228px;
        margin: 0px 0px 10px 15px;
        padding: 7px 0px 9px;
    }
    .flytitle-and-title__flytitle {
        display: block;
        font-size: smaller;
        color: red;
    }
    '''
    oldest_article = 7.0
    resolve_internal_links = True
    remove_tags = [
        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent',
                   'aside', 'footer']),
        dict(attrs={'aria-label': "Article Teaser"}),
        dict(attrs={
            'class': [
                'dblClkTrk', 'ec-article-info', 'share_inline_header',
                'related-items', 'main-content-container', 'ec-topic-widget',
                'teaser', 'blog-post__bottom-panel-bottom',
                'blog-post__comments-label', 'blog-post__foot-note',
                'blog-post__sharebar', 'blog-post__bottom-panel',
                'newsletter-form', 'share-links-header', 'teaser--wrapped',
                'latest-updates-panel__container',
                'latest-updates-panel__article-link', 'blog-post__section'
            ]
        }),
        dict(attrs={
            'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
        classes(
            'share-links-header teaser--wrapped latest-updates-panel__container'
            ' latest-updates-panel__article-link blog-post__section'
            ' newsletter-form blog-post__bottom-panel'
        )
    ]
    keep_only_tags = [dict(name='article', id=lambda x: not x)]
    no_stylesheets = True
    remove_attributes = ['data-reactid', 'width', 'height']
    # economist.com has started throttling after about 60% of the total has
    # downloaded, failing with connection reset by peer (104) errors.
    delay = 1

    needs_subscription = False

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to get the file size below Amazon's email
            # sending threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn('Kindle Output profile being used, reducing image '
                          'quality to keep file size below Amazon email threshold')

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # Add a cookie indicating we have accepted Economist's cookie
        # policy (needed when running from some European countries)
        ck = Cookie(
            version=0,
            name='notice_preferences',
            value='2:',
            port=None,
            port_specified=False,
            domain='.economist.com',
            domain_specified=False,
            domain_initial_dot=True,
            path='/',
            path_specified=False,
            secure=False,
            expires=None,
            discard=False,
            comment=None,
            comment_url=None,
            rest={'HttpOnly': None},
            rfc2109=False
        )
        br.cookiejar.set_cookie(ck)
        br.set_handle_gzip(True)
        return br

    def preprocess_raw_html(self, raw, url):
        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
        root = parse(raw)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        if script:
            try:
                load_article_from_json(script[0].text, root)
            except JSONHasNoContent:
                cleanup_html_article(root)
        # Replace lazy-loaded images with the plain <img> from their
        # <noscript> fallbacks
        for div in root.xpath('//div[@class="lazy-image"]'):
            noscript = list(div.iter('noscript'))
            if noscript and noscript[0].text:
                img = list(parse(noscript[0].text).iter('img'))
                if img:
                    p = noscript[0].getparent()
                    idx = p.index(noscript[0])
                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
                    p.remove(noscript[0])
        for x in root.xpath('//*[name()="script" or name()="style" or '
                            'name()="source" or name()="meta"]'):
            x.getparent().remove(x)
        # the economist uses <small> for small caps with a custom font
        for x in root.xpath('//small'):
            if x.text and len(x) == 0:
                x.text = x.text.upper()
                x.tag = 'span'
                x.set('style', 'font-variant: small-caps')
        raw = etree.tostring(root, encoding='unicode')
        return raw

    def publication_date(self):
        if edition_date:
            return parse_only_date(edition_date, as_utc=False)
        url = self.browser.open("https://www.economist.com/printedition").geturl()
        return parse_only_date(url.split("/")[-1], as_utc=False)
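
    # Note: the fallback above assumes that /printedition redirects to a URL
    # whose last path component is the edition date, e.g.
    # .../weeklyedition/2023-11-18.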

    def parse_index(self):
        # return [('Articles', [{
        #     'title': 'test',
        #     'url': 'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress'
        # }])]
        if edition_date:
            url = 'https://www.economist.com/weeklyedition/' + edition_date
            self.timefmt = ' [' + edition_date + ']'
        else:
            url = 'https://www.economist.com/printedition'
        # raw = open('/t/raw.html').read()
        raw = self.index_to_soup(url, raw=True)
        # with open('/t/raw.html', 'wb') as f:
        #     f.write(raw)
        soup = self.index_to_soup(raw)
        # nav = soup.find(attrs={'class': 'navigation__wrapper'})
        # if nav is not None:
        #     a = nav.find('a', href=lambda x: x and '/printedition/' in x)
        #     if a is not None:
        #         self.log('Following nav link to current edition', a['href'])
        #         soup = self.index_to_soup(process_url(a['href']))
        ans = self.economist_parse_index(soup)
        if not ans:
            raise NoArticles(
                'Could not find any articles, either the '
                'economist.com server is having trouble and you should '
                'try later or the website format has changed and the '
                'recipe needs to be updated.'
            )
        return ans

    def economist_parse_index(self, soup):
        script_tag = soup.find("script", id="__NEXT_DATA__")
        if script_tag is not None:
            data = json.loads(script_tag.string)
            # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
            self.cover_url = safe_dict(data, "props", "pageProps", "content",
                                       "image", "main", "url", "canonical")
            self.log('Got cover:', self.cover_url)

            feeds_dict = defaultdict(list)
            for part in safe_dict(data, "props", "pageProps", "content",
                                  "hasPart", "parts"):
                section = safe_dict(part, "print", "section", "headline") or ''
                title = safe_dict(part, "headline") or ''
                url = safe_dict(part, "url", "canonical") or ''
                if not section or not title or not url:
                    continue
                desc = safe_dict(part, "description") or ''
                sub = safe_dict(part, "subheadline") or ''
                if sub and section != sub:
                    desc = sub + ' :: ' + desc
                if '/interactive/' in url:
                    self.log('Skipping interactive article:', title, url)
                    continue
                feeds_dict[section].append({"title": title, "url": url,
                                            "description": desc})
                self.log(' ', title, url, '\n ', desc)
            return [(section, articles) for section, articles in feeds_dict.items()]
        else:
            return []
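
    # The return value is calibre's standard index structure: a list of
    # (section, list-of-article-dicts) tuples, for example:
    # [('Leaders', [{'title': ..., 'url': ..., 'description': ...}, ...])]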

    def eco_find_image_tables(self, soup):
        # Yield layout tables that hold exactly one image and one or two
        # <font> caption cells
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        for img in soup.findAll('img', srcset=True):
            del img['srcset']
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = new_tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, new_tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup

    def canonicalize_internal_url(self, url, is_link=True):
        if url.endswith('/print'):
            url = url.rpartition('/')[0]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)
