+
Skip to content

update coursera example #537

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 31, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 28 additions & 20 deletions _examples/coursera_courses/coursera_courses.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,18 @@ type Course struct {
URL string
Language string
Commitment string
HowToPass string
Rating string
}

func main() {
fName := "courses.json"
file, err := os.Create(fName)
if err != nil {
log.Fatalf("Cannot create file %q: %s\n", fName, err)
return
}
defer file.Close()

// Instantiate default collector
c := colly.NewCollector(
// Visit only domains: coursera.org, www.coursera.org
Expand All @@ -38,7 +45,7 @@ func main() {

courses := make([]Course, 0, 200)

// On every a element which has href attribute call callback
// On every <a> element which has "href" attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
// If the class attribute is this long string, return from the callback,
// as this <a> element is irrelevant
Expand All @@ -59,8 +66,8 @@ func main() {
log.Println("visiting", r.URL.String())
})

// On every a HTML element which has name attribute call callback
c.OnHTML(`a[name]`, func(e *colly.HTMLElement) {
// On every <a> element with collection-product-card class call callback
c.OnHTML(`a.collection-product-card`, func(e *colly.HTMLElement) {
// Activate detailCollector if the link contains "coursera.org/learn"
courseURL := e.Request.AbsoluteURL(e.Attr("href"))
if strings.Index(courseURL, "coursera.org/learn") != -1 {
Expand All @@ -71,30 +78,31 @@ func main() {
// Extract details of the course
detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
log.Println("Course found", e.Request.URL)
title := e.ChildText(".course-title")
title := e.ChildText(".banner-title")
if title == "" {
log.Println("No title found", e.Request.URL)
}
course := Course{
Title: title,
URL: e.Request.URL.String(),
Description: e.ChildText("div.content"),
Creator: e.ChildText("div.creator-names > span"),
Creator: e.ChildText("li.banner-instructor-info > a > div > div > span"),
Rating: e.ChildText("span.number-rating"),
}
// Iterate over rows of the table which contains different information
// about the course
e.ForEach("table.basic-info-table tr", func(_ int, el *colly.HTMLElement) {
switch el.ChildText("td:first-child") {
case "Language":
course.Language = el.ChildText("td:nth-child(2)")
// Iterate over div components and add details to course
e.ForEach(".AboutCourse .ProductGlance > div", func(_ int, el *colly.HTMLElement) {
svgTitle := strings.Split(el.ChildText("div:nth-child(1) svg title"), " ")
lastWord := svgTitle[len(svgTitle)-1]
switch lastWord {
// svg Title: Available languages
case "languages":
course.Language = el.ChildText("div:nth-child(2) > div:nth-child(1)")
// svg Title: Mixed/Beginner/Intermediate/Advanced Level
case "Level":
course.Level = el.ChildText("td:nth-child(2)")
case "Commitment":
course.Commitment = el.ChildText("td:nth-child(2)")
case "How To Pass":
course.HowToPass = el.ChildText("td:nth-child(2)")
case "User Ratings":
course.Rating = el.ChildText("td:nth-child(2) div:nth-of-type(2)")
course.Level = el.ChildText("div:nth-child(2) > div:nth-child(1)")
// svg Title: Hours to complete
case "complete":
course.Commitment = el.ChildText("div:nth-child(2) > div:nth-child(1)")
}
})
courses = append(courses, course)
Expand All @@ -103,7 +111,7 @@ func main() {
// Start scraping on https://coursera.org/browse
c.Visit("https://coursera.org/browse")

enc := json.NewEncoder(os.Stdout)
enc := json.NewEncoder(file)
enc.SetIndent("", " ")

// Dump json to the output file (courses.json)
Expand Down
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载