Skip to content

Commit

Permalink
get title (#18)
Browse files Browse the repository at this point in the history
* initial code

* better failure

* checkpoint

* all specs pass

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint
  • Loading branch information
dogweather authored Dec 7, 2023
1 parent d1fe266 commit fb584b4
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 30 deletions.
13 changes: 9 additions & 4 deletions lib/code_gen.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,29 @@ defmodule CodeGen do
"""


@spec ruby_code(binary) :: binary
def ruby_code(url) do
citations =
info =
url
|> URI.parse()
|> NewsUtil.find_citations()

citation_list =
info.citations
|> Enum.map_join(",\n ", fn cite -> "'#{cite}'" end)


"""
NewsImport.add(
Item.find_or_create_by(
url: URI('#{url}').to_s,
title: "#{info.title}",
summary: "#{info.description}",
secondary_source: Source.find_by!(name: ''),
title: "",
published_on: Date.parse(''),
summary: "",
),
[
#{citations}
#{citation_list}
]
)
"""
Expand Down
62 changes: 62 additions & 0 deletions lib/news/parser.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
defmodule News.Parser do
  @moduledoc """
  A module for parsing news articles.
  """

  @doc """
  Find the best title for an HTML document.

  `document` is passed straight to `Floki.find/2`, so it may be a parsed
  document tree or a raw HTML binary.

  Two candidates are taken from the `<title>` tag: its full text, and the
  text with any trailing " - Site Name" style suffix removed. Whichever one
  equals the `<h1>` text is returned. When neither equals it, the
  suffix-free `<title>` text is returned.
  """
  @spec find_title(binary() | Floki.html_tree()) :: binary()
  def find_title(document) do
    orig_title = title_tag(document)
    clean_title = title_without_hyphenation(orig_title)
    h1_title = h1_tag(document)

    # Whatever the h1 tag matches is definitely the best title.
    # If the h1 tag doesn't match one, then just use the
    # cleaned-up HTML title.
    cond do
      clean_title == h1_title -> clean_title
      orig_title == h1_title -> orig_title
      true -> clean_title
    end
  end

  @doc """
  Placeholder for a meta-tag based title lookup.

  The argument is ignored and a fixed string is returned.
  """
  @spec find_title_from_meta_tags(any()) :: binary()
  def find_title_from_meta_tags(_html) do
    "Charter School FAQ Section 99"
  end

  # Text of the <title> tag, trimmed so that it can be compared with the
  # (also trimmed) <h1> text and the suffix-free title.
  defp title_tag(document) do
    document
    |> Floki.find("title")
    |> Floki.text()
    |> String.trim()
  end

  # Drops everything from the first separator dash onward.
  #
  # The `u` modifier is required: without it the character class is made of
  # the individual UTF-8 bytes of the en/em dash and would split in the
  # middle of any other character that shares one of those bytes.
  #
  # A dash only counts as a separator when whitespace sits on at least one
  # side of it, so hyphenated words such as "COVID-19" are left intact.
  defp title_without_hyphenation(title) do
    title
    |> String.split(~r/\s[-–—]|[-–—]\s/u, parts: 2)
    |> List.first()
    |> String.trim()
  end

  # Text of the <h1> tag(s), trimmed like the title candidates above.
  defp h1_tag(document) do
    document
    |> Floki.find("h1")
    |> Floki.text()
    |> String.trim()
  end
end
1 change: 1 addition & 0 deletions lib/news/test.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ defmodule News.Test do
Test helpers.
"""

@doc """
Returns the path of the fixture file `name` inside `test/fixtures`.
"""
@spec fixture(binary) :: binary
def fixture(name), do: Path.join("test/fixtures", name)
Expand Down
52 changes: 40 additions & 12 deletions lib/news_util.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@ import List

import CalCodes
import News.Http
alias News.Parser


defmodule NewsUtil do
@moduledoc false

@spec find_citations(URI.t()) :: %{citations: [binary], description: binary, title: binary}
@doc """
Find citations in a string of HTML or from a URL.
"""
@spec find_citations(URI.t) :: [binary]
def find_citations(%URI{} = uri) do
url = URI.to_string(uri)
temp_file = News.File.tmp_file!(url)
Expand All @@ -21,19 +22,38 @@ defmodule NewsUtil do
end


@doc """
Reads the file at `path` and extracts its citations, title and description.

Files with a `.pdf` extension (in any letter case) are converted with
`News.File.read_pdf_as_html!/1`; every other file is read as-is. The
resulting HTML is handed to `find_info_in_html/1`, whose result is
returned unchanged.
"""
@spec find_citations_in_file(binary) :: %{
        citations: [binary],
        description: binary,
        title: binary
      }
def find_citations_in_file(path) do
  # Downcase so that "report.PDF" is not mistaken for raw HTML.
  extension =
    path
    |> Path.extname()
    |> String.downcase()

  html =
    case extension do
      ".pdf" -> News.File.read_pdf_as_html!(path)
      _ -> File.read!(path)
    end

  find_info_in_html(html)
end


@doc """
Parses `html` once and collects the citations, title and description
found in it.

Raises a `MatchError` when `Floki.parse_document/1` does not return
`{:ok, document}`.
"""
@spec find_info_in_html(binary) :: %{
        citations: [binary],
        description: binary,
        title: binary
      }
def find_info_in_html(html) do
  {:ok, parsed} = Floki.parse_document(html)

  %{
    citations: find_citations_in_html(html, parsed),
    title: Parser.find_title(parsed),
    description: find_description_in_html(parsed)
  }
end


@spec find_citations_in_html(binary) :: [binary]
defp find_citations_in_html(html) do
defp find_citations_in_html(html, document) do
cites_from_hrefs =
html
document
|> hrefs()
|> map(&href_to_cite/1)

Expand Down Expand Up @@ -62,10 +82,7 @@ defmodule NewsUtil do
end


@spec hrefs(binary) :: list[URI.t]
def hrefs(html) do
{:ok, document} = Floki.parse_document(html)

def hrefs(document) do
document
|> Floki.attribute("a", "href")
|> flatten()
Expand Down Expand Up @@ -118,4 +135,15 @@ defmodule NewsUtil do
end

defp make_cite_to_cal_codes(_), do: nil


# Content of the document's description meta tag, trimmed. For example,
# <meta name="description" content="Questions and answers regarding charter school staffing issues." />
# yields the text of its content attribute.
defp find_description_in_html(document) do
  content_values = Floki.attribute(document, "meta[name=description]", "content")
  description = Floki.text(content_values)

  String.trim(description)
end
end
13 changes: 0 additions & 13 deletions simple.exs

This file was deleted.

28 changes: 28 additions & 0 deletions test/news/parser_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
alias News.Parser
alias News.Test

defmodule News.ParserTest do
  @moduledoc false
  use ExUnit.Case
  doctest News.Parser

  # Fixture files paired with the title expected from each of them.
  @test_cases_for_title [
    %{file: "qandasec5.asp", title: "Charter School FAQ Section 5"},
    %{file: "qandasec6.asp", title: "Charter School FAQ Section 6"}
  ]

  # Generates one test per fixture at compile time.
  for %{file: fixture_name, title: expected_title} <- @test_cases_for_title do
    test "finds the title in #{fixture_name}" do
      {:ok, document} =
        unquote(fixture_name)
        |> Test.fixture()
        |> File.read!()
        |> Floki.parse_document()

      assert Parser.find_title(document) == unquote(expected_title)
    end
  end
end
7 changes: 6 additions & 1 deletion test/news_util_test.exs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
alias News.Test

import NewsUtil

defmodule NewsUtilTest do
Expand Down Expand Up @@ -66,7 +68,10 @@ defmodule NewsUtilTest do

# Generates one test per fixture. find_citations_in_file/1 returns a map
# with :citations, :title and :description, so only the :citations entry
# is matched against the expected list.
Enum.each(@test_cases, fn %{file: f, cites: c} ->
  test "finds the cites in #{f}" do
    file = unquote(f)
    cites = unquote(c)

    assert %{citations: ^cites} = file |> Test.fixture() |> find_citations_in_file()
  end
end)
end

0 comments on commit fb584b4

Please sign in to comment.