commit 1f32097aa9c48bc210911710428241b257962417
Author: Carsten Ringe
Date:   Fri Nov 3 06:32:49 2023 +0100

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..85e7c1d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.idea/
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..53b6ac9
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,11 @@
+module kopis.de/readability
+
+go 1.21.3
+
+require (
+	github.com/PuerkitoBio/goquery v1.8.1
+	golang.org/x/net v0.17.0
+	golang.org/x/text v0.13.0
+)
+
+require github.com/andybalholm/cascadia v1.3.1 // indirect
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..11e6191
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,38 @@
+github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
+github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
+github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
+github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
+golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
diff --git a/readability.go b/readability.go
new file mode 100644
index 0000000..ce84c0d
--- /dev/null
+++ b/readability.go
@@ -0,0 +1,369 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package readability
+
+import (
+	"bytes"
+	"fmt"
+	"golang.org/x/text/encoding/htmlindex"
+	"io"
+	"log"
+	"math"
+	"net/url"
+	"regexp"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+	"golang.org/x/net/html"
+)
+
+const (
+	defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
+)
+
+var (
+	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
+	sentenceRegexp       = regexp.MustCompile(`\.( |$)`)
+
+	blacklistCandidatesRegexp  = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
+	okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
+	unlikelyCandidatesRegexp   = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
+
+	negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`)
+	positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
+)
+
+type candidate struct {
+	selection *goquery.Selection
+	score     float32
+}
+
+func (c *candidate) Node() *html.Node {
+	return c.selection.Get(0)
+}
+
+func (c *candidate) String() string {
+	id, _ := c.selection.Attr("id")
+	class, _ := c.selection.Attr("class")
+
+	if id != "" && class != "" {
+		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
+	} else if id != "" {
+		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
+	} else if class != "" {
+		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
+	}
+
+	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
+}
+
+type candidateList map[*html.Node]*candidate
+
+func (c candidateList) String() string {
+	var output []string
+	for _, candidate := range c {
+		output = append(output, candidate.String())
+	}
+
+	return strings.Join(output, ", ")
+}
+
+// ExtractContent returns title and relevant content.
+func ExtractContent(url string, page io.Reader) (string, string, error) {
+	content, err := io.ReadAll(page)
+	if err != nil {
+		return "", "", err
+	}
+
+	document, err := readDocumentWithEncoding(content)
+	if err != nil {
+		return "", "", err
+	}
+
+	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
+		removeNodes(s)
+	})
+
+	transformMisusedDivsIntoParagraphs(document)
+	removeUnlikelyCandidates(document)
+	transformImages(url, document)
+
+	candidates := getCandidates(document)
+	log.Printf("[Readability] Candidates: %v", candidates)
+
+	topCandidate := getTopCandidate(document, candidates)
+	log.Printf("[Readability] TopCandidate: %v", topCandidate)
+
+	title := getTitle(document)
+	output := getArticle(topCandidate, candidates)
+	return title, output, nil
+}
+
+func readDocumentWithEncoding(content []byte) (*goquery.Document, error) {
+	var reader io.Reader
+	tmpDoc, err := goquery.NewDocumentFromReader(bytes.NewReader(content))
+	if err == nil {
+		node := tmpDoc.Find("meta[charset]")
+		if node.Length() > 0 {
+			first := node.Get(0)
+			charset := first.Attr[0].Val
+			encoding, err := htmlindex.Get(charset)
+			if err != nil {
+				encoding, _ = htmlindex.Get("utf-8")
+			}
+			log.Printf("[Readability] Using encoding %s", encoding)
+			reader = encoding.NewDecoder().Reader(bytes.NewReader(content))
+		} else {
+			log.Printf("[Readability] No meta charset found")
+			reader = bytes.NewReader(content)
+		}
+	} else {
+		log.Printf("[Readability] Using default encoding")
+		reader = bytes.NewReader(content)
+	}
+
+	document, err := goquery.NewDocumentFromReader(reader)
+	return document, err
+}
+
+func getTitle(document *goquery.Document) string {
+	title := document.Find("title").Text()
+	return title
+}
+
+// Now that we have the top candidate, look through its siblings for content that might also be related.
+// Things like preambles, content split by ads that we removed, etc.
+func getArticle(topCandidate *candidate, candidates candidateList) string {
+	output := bytes.NewBufferString("<div>")
+	siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
+
+	topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
+		append := false
+		node := s.Get(0)
+
+		if node == topCandidate.Node() {
+			append = true
+		} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
+			append = true
+		}
+
+		if s.Is("p") {
+			linkDensity := getLinkDensity(s)
+			content := s.Text()
+			contentLength := len(content)
+
+			if contentLength >= 80 && linkDensity < .25 {
+				append = true
+			} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
+				append = true
+			}
+		}
+
+		if append {
+			tag := "div"
+			if s.Is("p") {
+				tag = node.Data
+			}
+
+			html, _ := s.Html()
+			fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
+		}
+	})
+
+	output.Write([]byte("</div>"))
+	return output.String()
+}
+
+func removeUnlikelyCandidates(document *goquery.Document) {
+	document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) {
+		class, _ := s.Attr("class")
+		id, _ := s.Attr("id")
+		str := class + id
+
+		if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
+			removeNodes(s)
+		}
+	})
+}
+
+func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
+	var best *candidate
+
+	for _, c := range candidates {
+		if best == nil {
+			best = c
+		} else if best.score < c.score {
+			best = c
+		}
+	}
+
+	if best == nil {
+		best = &candidate{document.Find("body"), 0}
+	}
+
+	return best
+}
+
+// Loop through all paragraphs, and assign a score to them based on how content-y they look.
+// Then add their score to their parent node.
+// A score is determined by things like number of commas, class names, etc.
+// Maybe eventually link density.
+func getCandidates(document *goquery.Document) candidateList {
+	candidates := make(candidateList)
+
+	document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
+		text := s.Text()
+
+		// If this paragraph is less than 25 characters, don't even count it.
+		if len(text) < 25 {
+			return
+		}
+
+		parent := s.Parent()
+		parentNode := parent.Get(0)
+
+		grandParent := parent.Parent()
+		var grandParentNode *html.Node
+		if grandParent.Length() > 0 {
+			grandParentNode = grandParent.Get(0)
+		}
+
+		if _, found := candidates[parentNode]; !found {
+			candidates[parentNode] = scoreNode(parent)
+		}
+
+		if grandParentNode != nil {
+			if _, found := candidates[grandParentNode]; !found {
+				candidates[grandParentNode] = scoreNode(grandParent)
+			}
+		}
+
+		// Add a point for the paragraph itself as a base.
+		contentScore := float32(1.0)
+
+		// Add points for any commas within this paragraph.
+		contentScore += float32(strings.Count(text, ",") + 1)
+
+		// For every 100 characters in this paragraph, add another point. Up to 3 points.
+		contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3))
+
+		candidates[parentNode].score += contentScore
+		if grandParentNode != nil {
+			candidates[grandParentNode].score += contentScore / 2.0
+		}
+	})
+
+	// Scale the final candidates score based on link density. Good content
+	// should have a relatively small link density (5% or less) and be mostly
+	// unaffected by this operation.
+	for _, candidate := range candidates {
+		candidate.score = candidate.score * (1 - getLinkDensity(candidate.selection))
+	}
+
+	return candidates
+}
+
+func scoreNode(s *goquery.Selection) *candidate {
+	c := &candidate{selection: s, score: 0}
+
+	switch s.Get(0).DataAtom.String() {
+	case "div":
+		c.score += 5
+	case "pre", "td", "blockquote", "img":
+		c.score += 3
+	case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
+		c.score -= 3
+	case "h1", "h2", "h3", "h4", "h5", "h6", "th":
+		c.score -= 5
+	}
+
+	c.score += getClassWeight(s)
+	return c
+}
+
+// Get the density of links as a percentage of the content.
+// This is the amount of text that is inside a link divided by the total text in the node.
+func getLinkDensity(s *goquery.Selection) float32 {
+	linkLength := len(s.Find("a").Text())
+	textLength := len(s.Text())
+
+	if textLength == 0 {
+		return 0
+	}
+
+	return float32(linkLength) / float32(textLength)
+}
+
+// Get an element's class/id weight. Uses regular expressions to tell if this
+// element looks good or bad.
+func getClassWeight(s *goquery.Selection) float32 {
+	weight := 0
+	class, _ := s.Attr("class")
+	id, _ := s.Attr("id")
+
+	if class != "" {
+		if negativeRegexp.MatchString(class) {
+			weight -= 25
+		}
+
+		if positiveRegexp.MatchString(class) {
+			weight += 25
+		}
+	}
+
+	if id != "" {
+		if negativeRegexp.MatchString(id) {
+			weight -= 25
+		}
+
+		if positiveRegexp.MatchString(id) {
+			weight += 25
+		}
+	}
+
+	return float32(weight)
+}
+
+func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
+	document.Find("div").Each(func(i int, s *goquery.Selection) {
+		html, _ := s.Html()
+		if !divToPElementsRegexp.MatchString(html) {
+			node := s.Get(0)
+			node.Data = "p"
+		}
+	})
+}
+
+func transformImages(url string, document *goquery.Document) {
+	document.Find("img").Each(func(i int, s *goquery.Selection) {
+		node := s.Get(0)
+		for j, attr := range node.Attr {
+			if attr.Key == "src" {
+				node.Attr[j].Val = buildAbsoluteUrl(url, attr.Val)
+			}
+		}
+	})
+}
+
+func removeNodes(s *goquery.Selection) {
+	s.Each(func(i int, s *goquery.Selection) {
+		parent := s.Parent()
+		if parent.Length() > 0 {
+			parent.Get(0).RemoveChild(s.Get(0))
+		}
+	})
+}
+
+func buildAbsoluteUrl(baseUrl string, otherUrl string) string {
+	if strings.Index(otherUrl, "http") == 0 {
+		return otherUrl
+	} else if strings.Index(otherUrl, "/") == 0 {
+		u, _ := url.Parse(baseUrl)
+		return fmt.Sprintf("%s://%s%s", u.Scheme, u.Host, otherUrl)
+	} else {
+		i := strings.LastIndex(baseUrl, "/")
+		return fmt.Sprintf("%s/%s", baseUrl[:i], otherUrl)
+	}
+}
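
A quick worked example of the scoring in getCandidates, derived only from the code above: a 250-character paragraph containing three commas contributes 1 (base) + 4 (comma count plus one) + 2 (one point per 100 characters, capped at 3) = 7 points to its parent candidate and 3.5 points to its grandparent; each candidate's total is then multiplied by (1 - link density), so a node whose text is half links keeps only half of its accumulated score.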
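
A minimal sketch of how the package might be called, assuming the module path kopis.de/readability from go.mod; the article URL and the HTTP fetch are illustrative placeholders, only ExtractContent itself comes from the commit:

	package main

	import (
		"fmt"
		"log"
		"net/http"

		"kopis.de/readability"
	)

	func main() {
		// Hypothetical page; any io.Reader with HTML works as the second argument.
		pageURL := "https://example.com/article"

		resp, err := http.Get(pageURL)
		if err != nil {
			log.Fatal(err)
		}
		defer resp.Body.Close()

		// ExtractContent takes the page URL (used to absolutize <img src> attributes)
		// and the HTML body, and returns the <title> text plus the extracted article
		// HTML wrapped in a <div>.
		title, content, err := readability.ExtractContent(pageURL, resp.Body)
		if err != nil {
			log.Fatal(err)
		}

		fmt.Println(title)
		fmt.Println(content)
	}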