1
0

Initial commit

This commit is contained in:
2023-11-03 06:32:49 +01:00
commit 1f32097aa9
4 changed files with 419 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/.idea/

11
go.mod Normal file
View File

@@ -0,0 +1,11 @@
module kopis.de/readability
go 1.21.3
require (
github.com/PuerkitoBio/goquery v1.8.1
golang.org/x/net v0.17.0
golang.org/x/text v0.13.0
)
require github.com/andybalholm/cascadia v1.3.1 // indirect

38
go.sum Normal file
View File

@@ -0,0 +1,38 @@
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

369
readability.go Normal file
View File

@@ -0,0 +1,369 @@
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package readability
import (
"bytes"
"fmt"
"golang.org/x/text/encoding/htmlindex"
"io"
"log"
"math"
"net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
)
const (
// defaultTagsToScore is the selector getCandidates passes to Find to pick
// the elements whose text contributes to candidate scores.
defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
)
var (
// divToPElementsRegexp matches (case-insensitively) the opening of one of
// the listed tags. transformMisusedDivsIntoParagraphs relabels a div as a
// paragraph only when its inner HTML does not match.
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
// sentenceRegexp matches a period followed by a space or the end of the
// text. getArticle uses it to keep short, link-free paragraphs.
sentenceRegexp = regexp.MustCompile(`\.( |$)`)
// blacklistCandidatesRegexp is matched against class+id in
// removeUnlikelyCandidates; a match always removes the element.
blacklistCandidatesRegexp = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
// okMaybeItsACandidateRegexp spares an element that unlikelyCandidatesRegexp
// would otherwise cause removeUnlikelyCandidates to remove.
okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
// unlikelyCandidatesRegexp is matched against class+id in
// removeUnlikelyCandidates; a match removes the element unless
// okMaybeItsACandidateRegexp also matches.
unlikelyCandidatesRegexp = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
// negativeRegexp lowers the class weight by 25 in getClassWeight for each
// of class and id that matches.
negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`)
// positiveRegexp raises the class weight by 25 in getClassWeight for each
// of class and id that matches.
positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
)
// candidate pairs a selection with the content score accumulated for it.
type candidate struct {
	selection *goquery.Selection
	score     float32
}

// Node returns the first HTML node of the candidate's selection.
func (c *candidate) Node() *html.Node {
	return c.selection.Get(0)
}

// String renders the candidate as "tag#id.class => score", leaving out the
// "#id" and ".class" parts when the corresponding attribute is empty.
func (c *candidate) String() string {
	label := c.Node().DataAtom.String()

	if id, _ := c.selection.Attr("id"); id != "" {
		label += "#" + id
	}
	if class, _ := c.selection.Attr("class"); class != "" {
		label += "." + class
	}

	return fmt.Sprintf("%s => %f", label, c.score)
}
// candidateList indexes candidates by the HTML node they wrap.
type candidateList map[*html.Node]*candidate

// String joins the textual form of every candidate with ", ". The order
// follows map iteration and is therefore not stable between calls.
func (c candidateList) String() string {
	parts := make([]string, 0, len(c))
	for _, entry := range c {
		parts = append(parts, entry.String())
	}
	return strings.Join(parts, ", ")
}
// ExtractContent reads the whole page, parses it and returns the document
// title together with the HTML of the content judged most relevant.
//
// url is the address the page was fetched from; it is used to make image
// sources absolute. An error is returned when the page cannot be read or
// parsed.
func ExtractContent(url string, page io.Reader) (string, string, error) {
	raw, err := io.ReadAll(page)
	if err != nil {
		return "", "", err
	}

	doc, err := readDocumentWithEncoding(raw)
	if err != nil {
		return "", "", err
	}

	// Clean-up passes, in the order the scoring below relies on.
	removeNodes(doc.Find("script,style"))
	transformMisusedDivsIntoParagraphs(doc)
	removeUnlikelyCandidates(doc)
	transformImages(url, doc)

	scored := getCandidates(doc)
	log.Printf("[Readability] Candidates: %v", scored)

	winner := getTopCandidate(doc, scored)
	log.Printf("[Readability] TopCandidate: %v", winner)

	return getTitle(doc), getArticle(winner, scored), nil
}
// readDocumentWithEncoding parses content into a goquery document, decoding
// it with the character set declared by a <meta charset="..."> element.
//
// The bytes are first parsed as-is to look for the declaration. When one is
// found its label is resolved through htmlindex, falling back to UTF-8 for an
// unknown label. When there is no declaration, or the probing parse fails,
// the raw bytes are parsed unchanged.
//
// It returns the parsed document, or an error when the final parse fails.
func readDocumentWithEncoding(content []byte) (*goquery.Document, error) {
	// Default: hand the raw bytes to the parser untouched.
	var reader io.Reader = bytes.NewReader(content)

	probe, probeErr := goquery.NewDocumentFromReader(bytes.NewReader(content))
	if probeErr != nil {
		log.Printf("[Readability] Using default encoding")
	} else if meta := probe.Find("meta[charset]"); meta.Length() == 0 {
		log.Printf("[Readability] No meta charset found")
	} else {
		// Look the attribute up by name: charset is not guaranteed to be
		// the first attribute of the element.
		charset, _ := meta.Attr("charset")
		enc, err := htmlindex.Get(charset)
		if err != nil {
			enc, err = htmlindex.Get("utf-8")
		}
		// Only install the decoder when an encoding was actually resolved;
		// otherwise keep the raw reader instead of dereferencing nil.
		if err == nil {
			log.Printf("[Readability] Using encoding %s", enc)
			reader = enc.NewDecoder().Reader(bytes.NewReader(content))
		}
	}

	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, fmt.Errorf("readability: parsing document: %w", err)
	}
	return document, nil
}
// getTitle returns the combined text of the document's <title> elements.
func getTitle(document *goquery.Document) string {
	return document.Find("title").Text()
}
// getArticle builds the article HTML from the top candidate and those of its
// siblings that look related (preambles, content split apart by removed ads,
// and so on). The result is wrapped in a single <div>.
//
// A sibling is kept when it is a scored candidate reaching the sibling
// threshold (the larger of 10 and 20% of the top score), or when it is a
// paragraph that is either long with a low link density or short, link-free
// and sentence-like.
func getArticle(topCandidate *candidate, candidates candidateList) string {
	var out strings.Builder
	out.WriteString("<div>")

	topNode := topCandidate.Node()
	threshold := float32(math.Max(10, float64(topCandidate.score*.2)))

	related := topCandidate.selection.Siblings().Union(topCandidate.selection)
	related.Each(func(_ int, item *goquery.Selection) {
		node := item.Get(0)
		isParagraph := item.Is("p")

		keep := node == topNode
		if !keep {
			if scored, known := candidates[node]; known && scored.score >= threshold {
				keep = true
			}
		}

		if !keep && isParagraph {
			text := item.Text()
			density := getLinkDensity(item)
			switch {
			case len(text) >= 80:
				keep = density < .25
			default:
				keep = density == 0 && sentenceRegexp.MatchString(text)
			}
		}

		if !keep {
			return
		}

		// Paragraphs keep their own tag; everything else is wrapped in a div.
		tag := "div"
		if isParagraph {
			tag = node.Data
		}
		inner, _ := item.Html()
		fmt.Fprintf(&out, "<%s>%s</%s>", tag, inner, tag)
	})

	out.WriteString("</div>")
	return out.String()
}
// removeUnlikelyCandidates detaches every element (other than html and body)
// whose concatenated class and id match the blacklist, or match the
// "unlikely" pattern without also matching the "maybe a candidate" pattern.
func removeUnlikelyCandidates(document *goquery.Document) {
	elements := document.Find("*").Not("html,body")
	elements.Each(func(_ int, element *goquery.Selection) {
		class, _ := element.Attr("class")
		id, _ := element.Attr("id")
		marker := class + id

		blacklisted := blacklistCandidatesRegexp.MatchString(marker)
		unlikely := unlikelyCandidatesRegexp.MatchString(marker) &&
			!okMaybeItsACandidateRegexp.MatchString(marker)

		if blacklisted || unlikely {
			removeNodes(element)
		}
	})
}
// getTopCandidate returns the candidate with the highest score. When there
// are no candidates it falls back to the document body with a score of 0.
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
	var winner *candidate
	for _, contender := range candidates {
		if winner == nil || winner.score < contender.score {
			winner = contender
		}
	}

	if winner != nil {
		return winner
	}
	return &candidate{selection: document.Find("body"), score: 0}
}
// getCandidates scores every element selected by defaultTagsToScore by how
// content-like its text looks and credits that score to the element's parent
// (in full) and grandparent (by half). Parents and grandparents are the
// candidates; each starts from the base score given by scoreNode.
//
// Once all elements are processed, every candidate's score is scaled down by
// its link density.
func getCandidates(document *goquery.Document) candidateList {
	scored := make(candidateList)

	// lookup returns the candidate registered for sel, creating it on first use.
	lookup := func(sel *goquery.Selection) *candidate {
		node := sel.Get(0)
		if known, ok := scored[node]; ok {
			return known
		}
		fresh := scoreNode(sel)
		scored[node] = fresh
		return fresh
	}

	document.Find(defaultTagsToScore).Each(func(_ int, element *goquery.Selection) {
		text := element.Text()

		// Text shorter than 25 bytes is not counted at all.
		if len(text) < 25 {
			return
		}

		parent := element.Parent()
		parentEntry := lookup(parent)

		var grandParentEntry *candidate
		if grandParent := parent.Parent(); grandParent.Length() > 0 {
			grandParentEntry = lookup(grandParent)
		}

		// One base point for the element itself.
		points := float32(1.0)
		// One point plus one per comma in the text.
		points += float32(strings.Count(text, ",") + 1)
		// One point per full 100 bytes of text, capped at 3.
		points += float32(math.Min(float64(len(text)/100), 3))

		parentEntry.score += points
		if grandParentEntry != nil {
			grandParentEntry.score += points / 2.0
		}
	})

	// Link-heavy candidates lose score proportionally; content with few
	// links is left mostly untouched.
	for _, entry := range scored {
		entry.score *= 1 - getLinkDensity(entry.selection)
	}

	return scored
}
// scoreNode creates a candidate for s whose initial score is a bonus or
// penalty derived from the element's tag plus its class/id weight.
func scoreNode(s *goquery.Selection) *candidate {
	var tagBonus float32

	switch s.Get(0).DataAtom.String() {
	case "div":
		tagBonus = 5
	case "pre", "td", "blockquote", "img":
		tagBonus = 3
	case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
		tagBonus = -3
	case "h1", "h2", "h3", "h4", "h5", "h6", "th":
		tagBonus = -5
	}

	return &candidate{
		selection: s,
		score:     tagBonus + getClassWeight(s),
	}
}
// getLinkDensity returns the share of the selection's text that sits inside
// <a> elements: byte length of link text divided by byte length of all text.
// A selection without text has a density of 0.
func getLinkDensity(s *goquery.Selection) float32 {
	total := len(s.Text())
	if total == 0 {
		return 0
	}

	linked := len(s.Find("a").Text())
	return float32(linked) / float32(total)
}
// getClassWeight rates an element by its class and id attributes. Each
// non-empty attribute loses 25 when it matches negativeRegexp and gains 25
// when it matches positiveRegexp; both may apply to the same attribute.
func getClassWeight(s *goquery.Selection) float32 {
	class, _ := s.Attr("class")
	id, _ := s.Attr("id")

	total := 0
	for _, value := range []string{class, id} {
		if value == "" {
			continue
		}
		if negativeRegexp.MatchString(value) {
			total -= 25
		}
		if positiveRegexp.MatchString(value) {
			total += 25
		}
	}

	return float32(total)
}
// transformMisusedDivsIntoParagraphs renames to "p" every div whose inner
// HTML contains none of the tags listed in divToPElementsRegexp. Only the
// node's Data field is changed.
func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
	document.Find("div").Each(func(_ int, div *goquery.Selection) {
		inner, _ := div.Html()
		if divToPElementsRegexp.MatchString(inner) {
			return
		}
		div.Get(0).Data = "p"
	})
}
// transformImages rewrites the src attribute of every <img> in the document
// through buildAbsoluteUrl, using url as the base address.
func transformImages(url string, document *goquery.Document) {
	for _, img := range document.Find("img").Nodes {
		for k := range img.Attr {
			if img.Attr[k].Key != "src" {
				continue
			}
			img.Attr[k].Val = buildAbsoluteUrl(url, img.Attr[k].Val)
		}
	}
}
// removeNodes detaches every node of the selection from its parent. Nodes
// for which goquery reports no parent are left alone.
func removeNodes(s *goquery.Selection) {
	s.Each(func(_ int, item *goquery.Selection) {
		holder := item.Parent()
		if holder.Length() == 0 {
			return
		}
		holder.Get(0).RemoveChild(item.Get(0))
	})
}
// buildAbsoluteUrl resolves otherUrl against baseUrl and returns the absolute
// form.
//
// otherUrl is returned unchanged when it is empty, cannot be parsed, or is
// already absolute (any scheme, including data: URIs). It is also returned
// unchanged when baseUrl cannot be parsed or is not itself absolute, since
// there is then nothing meaningful to resolve against. In every other case
// the reference is resolved following RFC 3986, which covers root-relative
// ("/x"), protocol-relative ("//host/x") and path-relative ("x", "../x")
// references.
func buildAbsoluteUrl(baseUrl string, otherUrl string) string {
	if otherUrl == "" {
		return otherUrl
	}

	ref, err := url.Parse(otherUrl)
	if err != nil || ref.IsAbs() {
		return otherUrl
	}

	base, err := url.Parse(baseUrl)
	if err != nil || !base.IsAbs() {
		return otherUrl
	}

	return base.ResolveReference(ref).String()
}