-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* initial code * better failure * checkpoint * all specs pass * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint
- Loading branch information
1 parent
d1fe266
commit fb584b4
Showing
7 changed files
with
146 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
defmodule News.Parser do
  @moduledoc """
  A module for parsing news articles.
  """

  @doc """
  Finds the best title for a parsed HTML document.

  The text of the `<title>` tag is compared with the text of the `<h1>`
  tag(s):

    * when the raw `<title>` text equals the `<h1>` text, it is returned
      unchanged;
    * otherwise the `<title>` text is cut at its first hyphen, en dash or
      em dash and the trimmed leading part is returned.

  `document` is whatever `Floki.find/2` accepts, typically the tree returned
  by `Floki.parse_document/1`.
  """
  @spec find_title(
          binary()
          | [
              binary()
              | {:comment, binary()}
              | {:pi | binary(), binary() | [{any(), any()}] | %{optional(binary()) => binary()},
                 list() | %{optional(binary()) => binary()}}
              | {:doctype, binary(), binary(), binary()}
            ]
        ) :: binary()
  def find_title(document) do
    orig_title = title_tag(document)
    h1_title = h1_tag(document)

    # The raw <title> is only kept when the <h1> confirms it verbatim. In
    # every other case (including when the <h1> matches the shortened title)
    # the shortened title is the answer, so a single comparison is enough.
    if orig_title == h1_title do
      orig_title
    else
      title_without_hyphenation(orig_title)
    end
  end

  @doc """
  Placeholder for meta-tag based title detection.

  The argument is currently ignored and a fixed string is returned.
  """
  @spec find_title_from_meta_tags(any()) :: binary()
  def find_title_from_meta_tags(_html) do
    "Charter School FAQ Section 99"
  end

  # Concatenated text of every <title> element in the document.
  defp title_tag(document) do
    document
    |> Floki.find("title")
    |> Floki.text()
  end

  # Returns the part of `title` that precedes the first hyphen, en dash or
  # em dash, with surrounding whitespace removed.
  defp title_without_hyphenation(title) do
    # The `u` modifier is essential: without it the multi-byte dashes in the
    # character class are matched byte-by-byte, which splits the title in the
    # middle of unrelated characters that share those bytes (e.g. "’", "…").
    # Only the leading segment is needed, so stop after the first separator.
    [leading | _rest] = String.split(title, ~r/[-–—]/u, parts: 2)

    String.trim(leading)
  end

  # Concatenated text of every <h1> element in the document.
  defp h1_tag(document) do
    document
    |> Floki.find("h1")
    |> Floki.text()
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
alias News.Parser | ||
alias News.Test | ||
|
||
defmodule News.ParserTest do
  @moduledoc false
  use ExUnit.Case
  doctest News.Parser

  # Each entry pairs a fixture file name with the title that
  # `Parser.find_title/1` is expected to return for it.
  @test_cases_for_title [
    %{file: "qandasec5.asp", title: "Charter School FAQ Section 5"},
    %{file: "qandasec6.asp", title: "Charter School FAQ Section 6"}
  ]

  # One test is generated per fixture while the module compiles; `unquote`
  # injects the compile-time values into each generated test body.
  for %{file: fixture_name, title: expected_title} <- @test_cases_for_title do
    test "finds the title in #{fixture_name}" do
      html =
        unquote(fixture_name)
        |> Test.fixture()
        |> File.read!()

      {:ok, document} = Floki.parse_document(html)

      assert Parser.find_title(document) == unquote(expected_title)
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters