Initial commit
.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
/.idea/

go.mod (new file, 11 lines)
@@ -0,0 +1,11 @@
module kopis.de/readability

go 1.21.3

require (
	github.com/PuerkitoBio/goquery v1.8.1
	golang.org/x/net v0.17.0
	golang.org/x/text v0.13.0
)

require github.com/andybalholm/cascadia v1.3.1 // indirect

go.sum (new file, 38 lines)
@@ -0,0 +1,38 @@
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

readability.go (new file, 369 lines)
@@ -0,0 +1,369 @@
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package readability

import (
	"bytes"
	"fmt"
	"golang.org/x/text/encoding/htmlindex"
	"io"
	"log"
	"math"
	"net/url"
	"regexp"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
)

const (
	defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
)

var (
	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
	sentenceRegexp       = regexp.MustCompile(`\.( |$)`)

	blacklistCandidatesRegexp  = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
	okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
	unlikelyCandidatesRegexp   = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)

	negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`)
	positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
)

type candidate struct {
	selection *goquery.Selection
	score     float32
}

func (c *candidate) Node() *html.Node {
	return c.selection.Get(0)
}

func (c *candidate) String() string {
	id, _ := c.selection.Attr("id")
	class, _ := c.selection.Attr("class")

	if id != "" && class != "" {
		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
	} else if id != "" {
		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
	} else if class != "" {
		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
	}

	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
}

type candidateList map[*html.Node]*candidate

func (c candidateList) String() string {
	var output []string
	for _, candidate := range c {
		output = append(output, candidate.String())
	}

	return strings.Join(output, ", ")
}

// ExtractContent returns title and relevant content.
func ExtractContent(url string, page io.Reader) (string, string, error) {
	content, err := io.ReadAll(page)
	if err != nil {
		return "", "", err
	}

	document, err := readDocumentWithEncoding(content)
	if err != nil {
		return "", "", err
	}

	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
		removeNodes(s)
	})

	transformMisusedDivsIntoParagraphs(document)
	removeUnlikelyCandidates(document)
	transformImages(url, document)

	candidates := getCandidates(document)
	log.Printf("[Readability] Candidates: %v", candidates)

	topCandidate := getTopCandidate(document, candidates)
	log.Printf("[Readability] TopCandidate: %v", topCandidate)

	title := getTitle(document)
	output := getArticle(topCandidate, candidates)
	return title, output, nil
}

func readDocumentWithEncoding(content []byte) (*goquery.Document, error) {
	var reader io.Reader
	tmpDoc, err := goquery.NewDocumentFromReader(bytes.NewReader(content))
	if err == nil {
		node := tmpDoc.Find("meta[charset]")
		if node.Length() > 0 {
			first := node.Get(0)
			charset := first.Attr[0].Val
			encoding, err := htmlindex.Get(charset)
			if err != nil {
				encoding, _ = htmlindex.Get("utf-8")
			}
			log.Printf("[Readability] Using encoding %s", encoding)
			reader = encoding.NewDecoder().Reader(bytes.NewReader(content))
		} else {
			log.Printf("[Readability] No meta charset found")
			reader = bytes.NewReader(content)
		}
	} else {
		log.Printf("[Readability] Using default encoding")
		reader = bytes.NewReader(content)
	}

	document, err := goquery.NewDocumentFromReader(reader)
	return document, err
}

func getTitle(document *goquery.Document) string {
	title := document.Find("title").Text()
	return title
}

// Now that we have the top candidate, look through its siblings for content that might also be related.
// Things like preambles, content split by ads that we removed, etc.
func getArticle(topCandidate *candidate, candidates candidateList) string {
	output := bytes.NewBufferString("<div>")
	siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))

	topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
		append := false
		node := s.Get(0)

		if node == topCandidate.Node() {
			append = true
		} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
			append = true
		}

		if s.Is("p") {
			linkDensity := getLinkDensity(s)
			content := s.Text()
			contentLength := len(content)

			if contentLength >= 80 && linkDensity < .25 {
				append = true
			} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
				append = true
			}
		}

		if append {
			tag := "div"
			if s.Is("p") {
				tag = node.Data
			}

			html, _ := s.Html()
			fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
		}
	})

	output.Write([]byte("</div>"))
	return output.String()
}

func removeUnlikelyCandidates(document *goquery.Document) {
	document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) {
		class, _ := s.Attr("class")
		id, _ := s.Attr("id")
		str := class + id

		if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
			removeNodes(s)
		}
	})
}

func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
	var best *candidate

	for _, c := range candidates {
		if best == nil {
			best = c
		} else if best.score < c.score {
			best = c
		}
	}

	if best == nil {
		best = &candidate{document.Find("body"), 0}
	}

	return best
}

// Loop through all paragraphs, and assign a score to them based on how content-y they look.
// Then add their score to their parent node.
// A score is determined by things like number of commas, class names, etc.
// Maybe eventually link density.
func getCandidates(document *goquery.Document) candidateList {
	candidates := make(candidateList)

	document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
		text := s.Text()

		// If this paragraph is less than 25 characters, don't even count it.
		if len(text) < 25 {
			return
		}

		parent := s.Parent()
		parentNode := parent.Get(0)

		grandParent := parent.Parent()
		var grandParentNode *html.Node
		if grandParent.Length() > 0 {
			grandParentNode = grandParent.Get(0)
		}

		if _, found := candidates[parentNode]; !found {
			candidates[parentNode] = scoreNode(parent)
		}

		if grandParentNode != nil {
			if _, found := candidates[grandParentNode]; !found {
				candidates[grandParentNode] = scoreNode(grandParent)
			}
		}

		// Add a point for the paragraph itself as a base.
		contentScore := float32(1.0)

		// Add points for any commas within this paragraph.
		contentScore += float32(strings.Count(text, ",") + 1)

		// For every 100 characters in this paragraph, add another point. Up to 3 points.
		contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3))

		candidates[parentNode].score += contentScore
		if grandParentNode != nil {
			candidates[grandParentNode].score += contentScore / 2.0
		}
	})

	// Scale the final candidates score based on link density. Good content
	// should have a relatively small link density (5% or less) and be mostly
	// unaffected by this operation
	for _, candidate := range candidates {
		candidate.score = candidate.score * (1 - getLinkDensity(candidate.selection))
	}

	return candidates
}

func scoreNode(s *goquery.Selection) *candidate {
	c := &candidate{selection: s, score: 0}

	switch s.Get(0).DataAtom.String() {
	case "div":
		c.score += 5
	case "pre", "td", "blockquote", "img":
		c.score += 3
	case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
		c.score -= 3
	case "h1", "h2", "h3", "h4", "h5", "h6", "th":
		c.score -= 5
	}

	c.score += getClassWeight(s)
	return c
}

// Get the density of links as a percentage of the content
// This is the amount of text that is inside a link divided by the total text in the node.
func getLinkDensity(s *goquery.Selection) float32 {
	linkLength := len(s.Find("a").Text())
	textLength := len(s.Text())

	if textLength == 0 {
		return 0
	}

	return float32(linkLength) / float32(textLength)
}

// Get an elements class/id weight. Uses regular expressions to tell if this
// element looks good or bad.
func getClassWeight(s *goquery.Selection) float32 {
	weight := 0
	class, _ := s.Attr("class")
	id, _ := s.Attr("id")

	if class != "" {
		if negativeRegexp.MatchString(class) {
			weight -= 25
		}

		if positiveRegexp.MatchString(class) {
			weight += 25
		}
	}

	if id != "" {
		if negativeRegexp.MatchString(id) {
			weight -= 25
		}

		if positiveRegexp.MatchString(id) {
			weight += 25
		}
	}

	return float32(weight)
}

func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
	document.Find("div").Each(func(i int, s *goquery.Selection) {
		html, _ := s.Html()
		if !divToPElementsRegexp.MatchString(html) {
			node := s.Get(0)
			node.Data = "p"
		}
	})
}

func transformImages(url string, document *goquery.Document) {
	document.Find("img").Each(func(i int, s *goquery.Selection) {
		node := s.Get(0)
		for j, attr := range node.Attr {
			if attr.Key == "src" {
				node.Attr[j].Val = buildAbsoluteUrl(url, attr.Val)
			}
		}
	})
}

func removeNodes(s *goquery.Selection) {
	s.Each(func(i int, s *goquery.Selection) {
		parent := s.Parent()
		if parent.Length() > 0 {
			parent.Get(0).RemoveChild(s.Get(0))
		}
	})
}

func buildAbsoluteUrl(baseUrl string, otherUrl string) string {
	if strings.Index(otherUrl, "http") == 0 {
		return otherUrl
	} else if strings.Index(otherUrl, "/") == 0 {
		u, _ := url.Parse(baseUrl)
		return fmt.Sprintf("%s://%s%s", u.Scheme, u.Host, otherUrl)
	} else {
		i := strings.LastIndex(baseUrl, "/")
		return fmt.Sprintf("%s/%s", baseUrl[:i], otherUrl)
	}
}
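
For orientation, a minimal usage sketch of the package added by this commit (not part of the diff itself): it assumes the module is consumed under its kopis.de/readability module path and that the article HTML is fetched with net/http; the URL below is a placeholder.

package main

import (
	"fmt"
	"log"
	"net/http"

	"kopis.de/readability"
)

func main() {
	// Placeholder article URL; ExtractContent also uses it to resolve relative image URLs.
	const articleURL = "https://example.com/some-article"

	resp, err := http.Get(articleURL)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// ExtractContent returns the page title and the extracted article HTML.
	title, content, err := readability.ExtractContent(articleURL, resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(title)
	fmt.Println(content)
}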