Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement issue #276 exclude certain URLs from checking, also impleme… #368

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions docToolchainConfig.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ exportEA.with {
htmlSanityCheck.with {
sourceDir = 'microsite/output'
resultsFolder = 'html-sanity-check'
// Optional: regular expressions for URLs that the http link check should skip.
// Each expression has to match the complete URL, not just a part of it.
//exclude = ['(http|https)://example.com/excluded', '.*example-host.*']
}
//end::htmlSanityCheckConfig[]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.aim42.htmlsanitycheck.check.AllCheckers;
Expand Down Expand Up @@ -45,6 +46,9 @@ public class Configuration {
@Getter(AccessLevel.NONE)
@Builder.Default
Boolean ignoreIPAddresses = false;
@Setter
Set<String> exclude;

/*
* Explanation for configuring http status codes:
* The standard http status codes are defined in class @link NetUtil and can
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;


/**
Expand All @@ -23,7 +25,6 @@
*/
@Slf4j
class BrokenHttpLinksChecker extends Checker {

// Static initializer: calls TrustAllCertificates.install() once, when this class is initialized.
// NOTE(review): presumably this relaxes TLS certificate validation so that links to hosts
// with invalid or self-signed certificates can still be checked - confirm in TrustAllCertificates.
static {
TrustAllCertificates.install();
}
Expand All @@ -37,6 +38,7 @@ class BrokenHttpLinksChecker extends Checker {
// need that to calculate "nrOfOccurrences"
// the pure http/https-hrefs a set, duplicates are removed here
private Set<String> hrefSet;
private Set<Pattern> excludePatterns;


BrokenHttpLinksChecker(Configuration pConfig) {
Expand All @@ -45,6 +47,8 @@ class BrokenHttpLinksChecker extends Checker {
errorCodes = getMyConfig().getHttpErrorCodes();
warningCodes = getMyConfig().getHttpWarningCodes();
successCodes = getMyConfig().getHttpSuccessCodes();
Set<String> exclude = getMyConfig().getExclude();
setExclude(exclude);
}

@Override
Expand Down Expand Up @@ -82,25 +86,34 @@ private void addWarningIfNoInternetConnection() {
}

/**
* check all http(s) links
* TODO: use GPARS to check several links in parallel, as sequential checking might take too long
**/
* check all http(s) links
* TODO: use GPARS to check several links in parallel, as sequential checking might take too long
**/
private void checkAllHttpLinks() {
    // hrefSet holds each distinct http(s) href once; every entry is
    // handed to the double-check routine, one after the other.
    for (String href : hrefSet) {
        doubleCheckSingleHttpLink(href);
    }
}

/**
* Double-Check a single http(s) link:
* Some servers don't accept head request and send errors like 403 or 405,
* instead of 200.
* Therefore, we double-check: in case of errors or warnings,
* we try again with a GET, to get the "finalResponseCode" -
* which we then categorize as success, error or warning
*/
* Double-Check a single http(s) link:
* Some servers don't accept head request and send errors like 403 or 405,
* instead of 200.
* Therefore, we double-check: in case of errors or warnings,
* we try again with a GET, to get the "finalResponseCode" -
* which we then categorize as success, error or warning
*/


protected void doubleCheckSingleHttpLink(String href) {
// Check if the href matches any of the regular expressions in the exclude set
if (excludePatterns != null) {
for (Pattern pattern : excludePatterns) {
if (pattern.matcher(href).matches()) {
// Skip checking this URL
return;
}
}
}
// bookkeeping:
getCheckingResults().incNrOfChecks();

Expand Down Expand Up @@ -224,10 +237,19 @@ private void checkIfLocalhostURL(URL url, String href) {
}


/**
 * Replaces the exclusion patterns used by this checker.
 * <p>
 * Every string in {@code exclude} is compiled as a regular expression.
 * A {@code null} argument results in an empty pattern set, i.e. nothing is excluded.
 *
 * @param exclude regular expressions describing the URLs to skip, may be {@code null}
 * @throws java.util.regex.PatternSyntaxException if one of the expressions is not a valid regex
 */
public void setExclude(Set<String> exclude) {
    // Always start from a fresh, empty set so earlier exclusions are dropped.
    excludePatterns = new HashSet<>();
    if (exclude == null) {
        return;
    }
    exclude.forEach(regex -> excludePatterns.add(Pattern.compile(regex)));
}
}

/*========================================================================
Copyright Gernot Starke and aim42 contributors
Copyright Gernot Starke and aim42 contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,43 @@ class ConfigurationSpec extends Specification {
myConfig.overrideHttpWarningCodes(null)
myConfig.overrideHttpErrorCodes(null)
}

// Round-trip check for the `exclude` setting: a set of URL-style regular
// expressions assigned to the configuration must be returned by getExclude().
// Only storing and reading is covered here; no URL is matched against the expressions.
def "can set and retrieve urlsToExclude"() {
given: "a set of URLs to exclude"
Set<String> urlsToExclude = ["(http|https)://example\\.com", "http://test\\.com"]

when: "we set the exclude in the configuration"
myConfig.exclude = urlsToExclude

then: "the configuration should contain these URLs"
myConfig.getExclude() == urlsToExclude
}

// Same round-trip check as for URL expressions, this time with host-style
// regular expressions (leading/trailing wildcards, optional port).
// The expressions are only stored and read back, they are not evaluated.
def "can set and retrieve hostsToExclude"() {
given: "a set of hosts to exclude"
Set<String> hostsToExclude = [".*example\\.com.*", ".*myhost\\.(com|org):2000"]

when: "we set the exclude in the configuration"
myConfig.exclude = hostsToExclude

then: "the configuration should contain these hosts"
myConfig.getExclude() == hostsToExclude
}

// Round-trip check with a mixed set: URL-style and host-style regular
// expressions live in the same `exclude` set and must all be returned unchanged.
def "can set and retrieve urls and hosts to exclude"() {
given: "a set of URLs and hosts to exclude"
Set<String> exclusions = [
"(http|https)://example\\.com",
"http://test\\.com",
".*example2\\.com.*",
".*myhost\\.(com|org):23"
]

when: "we set the exclude in the configuration"
myConfig.exclude = exclusions

then: "the configuration should contain these URLs and hosts"
myConfig.getExclude() == exclusions
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ class BrokenHttpLinksCheckerSpec extends Specification {
def "regression for issue 272"(String goodUrl) {
given: "an HTML page with a single correct anchor/link"
String HTML = """$HtmlConst.HTML_HEAD
<a href=$goodUrl>url that lead to unknown host</a>
<a href="${goodUrl}">url that lead to unknown host</a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Expand All @@ -145,7 +145,7 @@ class BrokenHttpLinksCheckerSpec extends Specification {
given: "an HTML page with a single (bad) link"
String badhref = "http://arc42.org:$port/ui98jfuhenu87djch"
String HTML = """$HtmlConst.HTML_HEAD
<a href=${badhref}>nonexisting arc42 link</a>
<a href="${badhref}">nonexisting arc42 link</a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Expand All @@ -168,7 +168,7 @@ class BrokenHttpLinksCheckerSpec extends Specification {
given: "an HTML page with a single (good) amazon link"
String goodAmazonLink = "http://www.amazon.com:$port/dp/B01A2QL9SS"
String HTML = """$HtmlConst.HTML_HEAD
<a href=${goodAmazonLink}>Amazon</a>
<a href="${goodAmazonLink}">Amazon</a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Expand All @@ -190,7 +190,7 @@ class BrokenHttpLinksCheckerSpec extends Specification {
given: "an HTML page with a single (good) amazon link"
String badAmazonLink = "https://www.amazon.com:$port/dp/4242424242"
String HTML = """$HtmlConst.HTML_HEAD
<a href=${badAmazonLink}>Amazon</a>
<a href="${badAmazonLink}">Amazon</a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Expand All @@ -210,7 +210,7 @@ class BrokenHttpLinksCheckerSpec extends Specification {
given: "an HTML page with a single (broken) link"
String goodURL = "http://mock.codes$port/${badLink}"
String HTML = """$HtmlConst.HTML_HEAD
<a href=${goodURL}>${badLink}</a>
<a href="${goodURL}">${badLink}</a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Expand All @@ -232,7 +232,7 @@ class BrokenHttpLinksCheckerSpec extends Specification {

given: "the old arc42 (http!) page "
String HTML = """$HtmlConst.HTML_HEAD
<a href="http://arc42.de:$port/old"</a>
<a href="http://arc42.de:$port/old"></a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Expand Down Expand Up @@ -269,6 +269,69 @@ class BrokenHttpLinksCheckerSpec extends Specification {
collector?.getFindings()?.first()?.whatIsTheProblem?.contains("suspicious")

}


// Links whose complete href matches one of the exclude expressions must be skipped
// by the checker and therefore must not produce any finding.
// The hrefs used here have no path, so each expression ends with the port part.
// NOTE(review): presumably these hosts do not resolve, so a link that was NOT skipped
// would show up as a finding - confirm.
def "urlsToExclude are not checked"() {
given: "HTML page with url to be excluded"
String HTML = """$HtmlConst.HTML_HEAD
<a href="http://exclude-this-url.com:8080">Excluded URL</a>
<a href="https://exclude-this-url.com:8443">Excluded URL</a>
<a href="https://exclude-this-url.org:9090">Excluded URL</a>
<a href="https://exclude-also-this-url.org:7070">Excluded URL</a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Set<String> urlsToExclude = ["(http|https)://exclude-this-url.*:\\d+", "(http|https)://exclude-also-this-url.org:\\d+"]
brokenHttpLinksChecker.setExclude(urlsToExclude)

when: "page is checked"
collector = brokenHttpLinksChecker.performCheck(htmlPage)

then: "no findings are reported"
collector.getFindings().isEmpty()
}

// Host-style exclusion: the expressions start with a wildcard instead of a scheme and
// end with "/.*", so they cover the host, the port and any path of the hrefs below.
// All four links must be skipped, i.e. the check must report no findings.
def "hostsToExclude are not checked"() {
given: "HTML page with host to be excluded"
String HTML = """$HtmlConst.HTML_HEAD
<a href="http://exclude-this-host.com:8080/page">Excluded Host</a>
<a href="http://exclude-this-host.org:8443/page">Excluded Host</a>
<a href="http://exclude-also-this-host.com:9090/page">Excluded Host</a>
<a href="http://exclude-also-this-host.com:7070/page">Excluded Host</a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Set<String> hostsToExclude = [".*exclude-this-host.*:\\d+/.*", ".*exclude-also-this-host.*:\\d+/.*"]
brokenHttpLinksChecker.setExclude(hostsToExclude)

when: "page is checked"
collector = brokenHttpLinksChecker.performCheck(htmlPage)

then: "no findings are reported"
collector.getFindings().isEmpty()
}

// Mixed exclusion: one scheme-anchored URL expression (for the links without a path)
// and two host-style expressions (for the links with a path) are active at the same time.
// Every link on the page is covered by one of them, so no findings are expected.
def "mixedUrlsAndHostsToExclude are not checked"() {
given: "HTML page with mixed urls and hosts to be excluded"
String HTML = """$HtmlConst.HTML_HEAD
<a href="http://exclude-this-url.com:8080">Excluded URL</a>
<a href="https://exclude-this-host.com:8443/page">Excluded Host</a>
<a href="http://exclude-this-url.org:9090">Excluded URL</a>
<a href="https://exclude-also-this-host.org:7070/page">Excluded Host</a>
$HtmlConst.HTML_END """

htmlPage = new HtmlPage(HTML)
Set<String> mixedToExclude = ["(http|https)://exclude-this-url\\.(com|org):\\d+", ".*exclude-this-host\\..*:\\d+/.*", ".*exclude-also-this-host\\..*:\\d+/.*"]
brokenHttpLinksChecker.setExclude(mixedToExclude)

when: "page is checked"
collector = brokenHttpLinksChecker.performCheck(htmlPage)

then: "no findings are reported"
collector.getFindings().isEmpty()
}


}

/************************************************************************
Expand Down
12 changes: 12 additions & 0 deletions htmlSanityCheck-gradle-plugin/README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,16 @@ include::../htmlSanityCheck-core/src/main/java/org/aim42/htmlsanitycheck/tools/W
The lists shown above are the default HTTP response codes handled by HSC.
The mentioned configurations effectively move the configured codes around, i.e., if you add `308` to `httpErrorCodes` it is automatically removed from its default list (`httpWarningCodes`).
****
`exclude` (optional):: A Set of regular expressions for URLs that should be excluded from the sanity check. Each expression must match the complete URL, so use `.*` where a scheme, port or path may vary.
+
Type: Set.
+
Default: Empty set
+
[source,groovy]
----
[]
----


[[sec:examples]]
Expand Down Expand Up @@ -314,6 +324,8 @@ htmlSanityCheck {
// * MissingLocalResourcesChecker
checkerClasses = [DuplicateIdChecker, MissingImageFilesChecker]

// Exclude from checking
exclude = ["(http|https)://exclude-this-url.*", ".*skip-host.org.*"]
}

----
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ class HtmlSanityCheckTask extends DefaultTask {
@Optional
@Input
Set<Integer> httpSuccessCodes
@Optional
@Input
Set<String> exclude

@Input
List<Class<? extends Checker>> checkerClasses = AllCheckers.CHECKER_CLASSES
Expand Down Expand Up @@ -187,6 +190,7 @@ See ${checkingResultsDir} for a detailed report."""
.ignoreIPAddresses(ignoreIPAddresses)

.checksToExecute(checkerClasses)
.exclude(exclude)
.build()

// in case we have configured specific interpretations of http status codes
Expand All @@ -212,6 +216,7 @@ See ${checkingResultsDir} for a detailed report."""
logger.info "Results dir : $checkingResultsDir"
logger.info "JUnit dir : $junitResultsDir"
logger.info "Fail on errors : $failOnErrors"
logger.info "exclude : $exclude"
}
}

Expand Down
Loading
Loading