Skip to content

Commit 5e0558f

Browse files
committed
feat: support for 2024-09-06 step
1 parent 30a3d49 commit 5e0558f

File tree

10 files changed

+184
-3
lines changed

10 files changed

+184
-3
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
# frozen_string_literal: true

module SerpParser
  module Google
    module OrganicResults
      # Parser for a "featured" organic result block.
      #
      # An element is handled by this parser when it matches SELECTOR and contains
      # every selector listed in REQUIRED_CHILDREN.
      class FeaturedResult1 < SerpParser::Google::Search
        include SerpParser::Google::OrganicResults::Shared
        include SerpParser::Helpers

        # CSS selector of the element wrapping a single result.
        # @return [String]
        SELECTOR = "div.Gx5Zad.xpd.EtOod.pkphOe"

        # CSS selectors that must match inside the wrapping element.
        # @return [Array<String>]
        REQUIRED_CHILDREN = ["span div.BNeawe span.rQMQod.Xb5VRe"]

        # List of allowed span elements (determined by class name) in the description.
        # @return [Array<String>]
        ALLOWED_DESCRIPTION_ELEMENTS = ["r0bn4c rQMQod"]

        # The schema for the organic result.
        # @return [Hash]
        SCHEMA = {
          title: {
            type: :instance_method
          },
          description: {
            type: :instance_method
          },
          url: {
            type: :instance_method
          },
          site_links: {
            type: :collection,
            parsers: [
              SerpParser::Google::OrganicResults::SiteLinks::SiteLink2
            ]
          },
          rating: {
            type: :hash,
            parsers: [
              SerpParser::Google::OrganicResults::Ratings::Rating2
            ]
          }
        }

        # Returns the title of the result.
        # @return [String]
        def title
          element = @doc.css("span.rQMQod.Xb5VRe")
          clean_text element.text
        end

        # Returns the URL of the result.
        # @return [String, nil] nil when the result contains no link
        def url
          link = @doc.css(".kCrYT a").first
          # Without this guard a result lacking an anchor raised NoMethodError on nil.
          return unless link

          clean_google_url link["href"]
        end

        # Returns the description of the result.
        # @return [String, nil] nil when no description element is present
        def description
          element = @doc.css(".PqksIc.nRlVm")
          # `css` returns a node set and never nil, so emptiness is the real "missing" signal.
          return if element.empty?

          clean_text element.text
        end

        private

        # Finds the last node whose children are only text nodes or permitted spans
        # (spans with no class, a blank class, or a class in ALLOWED_DESCRIPTION_ELEMENTS).
        #
        # NOTE(review): not called by any method defined in this class; kept because the
        # included modules are not visible here and may depend on it.
        #
        # @param element [Enumerable<Nokogiri::XML::Node>] candidate nodes, searched from last to first
        # @return [Nokogiri::XML::Node, nil]
        def find_description_text_node(element)
          element.reverse_each.find do |node|
            node.children.all? do |child|
              next true if child.text?
              next false unless child.element? && child.name == "span"

              css_class = child["class"]
              css_class.nil? || css_class.strip.empty? || ALLOWED_DESCRIPTION_ELEMENTS.include?(css_class)
            end
          end
        end
      end
    end
  end
end

lib/serp_parser/google/organic_results/organic_result1.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ class OrganicResult1 < SerpParser::Google::Search
77

88
# @return [String]
99
SELECTOR = "div.g.Ww4FFb"
10+
REQUIRED_CHILDREN = ["h3"]
1011

1112
# The schema for the organic result.
1213
# @return [Hash]

lib/serp_parser/google/organic_results/organic_result2.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ class OrganicResult2 < SerpParser::Google::Search
66
include SerpParser::Helpers
77

88
# @return [String]
9-
SELECTOR = "div.Gx5Zad.fP1Qef.xpd.EtOod.pkphOe"
9+
SELECTOR = "div.Gx5Zad.xpd.EtOod.pkphOe"
10+
REQUIRED_CHILDREN = ["h3"]
1011

1112
# List of allowed span elements (determined by class name) in the description.
1213
# @return [Array]
@@ -59,6 +60,8 @@ def description
5960
element = @doc.css(".BNeawe.s3v9rd.AP7Wnd")
6061
element = find_description_text_node(element)
6162
element = remove_span_elements(element)
63+
return if element.nil?
64+
6265
clean_text element.text
6366
end
6467

lib/serp_parser/google/search.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ class Search < SerpParser::Parser
66
organic_results: {
77
type: :collection,
88
parsers: [
9-
SerpParser::Google::OrganicResults::OrganicResult1
9+
SerpParser::Google::OrganicResults::FeaturedResult1,
10+
SerpParser::Google::OrganicResults::OrganicResult1,
11+
SerpParser::Google::OrganicResults::OrganicResult2,
1012
]
1113
},
1214
search_information: {

lib/serp_parser/parser.rb

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,16 @@ def build_merged_hash(parsers, default_values: {})
# Runs each parser over the elements matching its SELECTOR and collects the results.
#
# A parser may define REQUIRED_CHILDREN, a list of CSS selectors. Elements in which
# any of those selectors has no match are skipped for that parser. The lookup goes
# through `element.children.css`, so a match may be a child or any deeper descendant.
#
# @param parsers [Array<Class>] parser classes exposing SELECTOR (and optionally REQUIRED_CHILDREN)
# @return [Array] processed data of every accepted element, nil entries removed
def parse_children(parsers)
  parsers.flat_map do |parser|
    # Resolved once per parser; an absent or empty list disables the filter.
    required_children = defined?(parser::REQUIRED_CHILDREN) ? parser::REQUIRED_CHILDREN : []

    @doc.css(parser::SELECTOR).map do |element|
      required_children_exist = required_children.all? do |child_selector|
        !element.children.css(child_selector).empty?
      end
      next unless required_children_exist

      parser.new(element).processed_data
    end.compact # Remove nil values resulting from the `next` statement
  end
end
7583

spec/files/google/2024-09-06.html

Lines changed: 13 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<div><div class="Gx5Zad xpd EtOod pkphOe"><div class="egMi0 kCrYT"><a href="/url?q=https://middagskassen.se/stad/goteborg/&amp;sa=U&amp;ved=2ahUKEwiuhOiSpq6IAxWKpZUCHY0LByIQFnoECGIQAg&amp;usg=AOvVaw29sektXHtJGqYYCXhPYQOh" data-ved="2ahUKEwiuhOiSpq6IAxWKpZUCHY0LByIQFnoECGIQAg"><div class="DnJfK"><div class="j039Wc"><h3 class="zBAuLc l97dzf"><div class="BNeawe vvjwJb AP7Wnd UwRFLe" style="-webkit-line-clamp:2">Hela listan (2024). - Matkassar i Göteborg med hemleverans</div></h3></div><div class="sCuL3"><div class="BNeawe UPmit AP7Wnd lRVwie">middagskassen.se › Städer</div></div></div></a></div><div class="kCrYT"><div><div class="BNeawe s3v9rd AP7Wnd"><div><div><div class="BNeawe s3v9rd AP7Wnd">Lista med leverantörer som kör ut matkassar till adresser i Göteborg och nära orter. ✓Rabattkoder. ✓Omdömen. ✓Bäst i Test 2024.</div></div></div></div></div></div></div></div>
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"title": "Hela listan (2024). - Matkassar i Göteborg med hemleverans",
3+
"description": "Lista med leverantörer som kör ut matkassar till adresser i Göteborg och nära orter. ✓Rabattkoder. ✓Omdömen. ✓Bäst i Test 2024.",
4+
"url": "https://middagskassen.se/stad/goteborg/",
5+
"date": null,
6+
"rating": null,
7+
"site_links": []
8+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# frozen_string_literal: true
2+
3+
require "spec_helper"
4+
require "json"
5+
6+
# Checks the parser output against the stored fixture via the shared example.
# NOTE(review): the described class is OrganicResult2 while the fixture path is
# "google/organic_result3/..." — confirm the class/fixture pairing is intended.
RSpec.describe SerpParser::Google::OrganicResults::OrganicResult2 do
7+
it_behaves_like "matches the expected output", "google/organic_result3/general_result"
8+
end
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
# frozen_string_literal: true

require "spec_helper"

# End-to-end expectations for the Google search parser against the 2024-09-06 fixture page.
RSpec.describe SerpParser::Google::Search do
  let(:html) { File.read("spec/files/google/2024-09-06.html") }
  # NOTE(review): `doc` is never referenced below; `parser` and `parser_with_html` are
  # both built from the raw HTML, so "initializes with doc" does not exercise a parsed
  # document. Confirm whether `parser` was meant to be constructed from `doc`.
  let(:doc) { Nokogiri::HTML::DocumentFragment.parse(html) }
  let(:parser) { described_class.new(html) }
  let(:parser_with_html) { described_class.new(html) }

  describe "#initialize" do
    it "initializes with doc" do
      expect { parser }.not_to raise_error
    end

    it "initializes with html" do
      expect { parser_with_html }.not_to raise_error
    end
  end

  describe "#search_information" do
    it "returns hash with information" do
      expect(parser.search_information).to be_an_instance_of(Hash)
    end
  end

  describe "#organic_results" do
    it "returns a collection object" do
      expect(parser.organic_results).to be_an_instance_of(SerpParser::Collection)
    end

    it "returns organic results" do
      expect(parser.organic_results).to all(be_an_instance_of(SerpParser::Models::OrganicResult))
    end

    # The example title previously said 98 while the assertion expects 99.
    it "returns 99 organic results" do
      expect(parser.organic_results.size).to eq(99)
    end

    it "returns featured post as first organic result" do
      expect(parser.organic_results.first.title).to eq("Matkasse Göteborg | Jämför de bästa matkassarna 2024")
      expect(parser.organic_results.first.description).to match("Bästa matkassarna i Göteborg")
      expect(parser.organic_results.first.description).to match("Beställ från HelloFresh.")
      expect(parser.organic_results.first.url).to eq("https://matkassen.nu/matkasse-goteborg/")
    end

    # The group title previously said "first result" while the examples inspect index 9.
    describe "#site_links of result at index 9" do
      it "returns an array of site links" do
        expect(parser.organic_results[9].site_links).to all(be_an_instance_of(SerpParser::Models::OrganicResults::SiteLink))
      end

      it "returns 4 site links" do
        expect(parser.organic_results[9].site_links.size).to eq(4)
      end
    end
  end
end

0 commit comments

Comments
 (0)