From 4045eb2bc5ed2822914e3ff51e66aa4735638885 Mon Sep 17 00:00:00 2001 From: Jeroen Ooms Date: Thu, 3 Jul 2025 13:12:41 +0200 Subject: [PATCH 1/6] CI across more distros --- .github/workflows/R-CMD-check.yaml | 11 ++++----- .github/workflows/distros.yml | 39 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/distros.yml diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index cd88fc94..6159693f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -14,6 +14,10 @@ name: R-CMD-check.yaml permissions: read-all +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: R-CMD-check: runs-on: ${{ matrix.config.os }} @@ -24,20 +28,15 @@ jobs: fail-fast: false matrix: config: + - {os: macos-13, r: 'release'} - {os: macos-latest, r: 'release'} - - {os: windows-latest, r: 'release'} - {os: windows-latest, r: 'oldrel-1'} - {os: windows-latest, r: 'oldrel-2'} - {os: windows-latest, r: 'oldrel-3'} - {os: windows-latest, r: 'oldrel-4'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} - - {os: ubuntu-latest, r: 'oldrel-2'} - - {os: ubuntu-latest, r: 'oldrel-3'} - - {os: ubuntu-latest, r: 'oldrel-4'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/distros.yml b/.github/workflows/distros.yml new file mode 100644 index 00000000..9425cb1b --- /dev/null +++ b/.github/workflows/distros.yml @@ -0,0 +1,39 @@ +name: Legacy Distros + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + rhel: + runs-on: ubuntu-24.04${{matrix.arch=='arm64' && '-arm' || ''}} + name: ${{ matrix.distro }} ${{ 
matrix.arch }} + strategy: + fail-fast: false + matrix: + distro: [ 'rocky-8', 'rocky-9', 'ubuntu-20.04', 'ubuntu-22.04', 'debian-10', 'debian-11', 'debian-12'] + #arch: [ 'amd64', 'arm64' ] + arch: [ 'amd64' ] + container: + image: ghcr.io/r-devel/${{ matrix.distro }}:latest + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + args: '"--no-manual"' + env: + NOT_CRAN: false + _R_CHECK_DOC_SIZES_: FALSE + LANG: en_US.UTF-8 From 4db87d51d3f00a330d111bdae431a3e5fa34f3fc Mon Sep 17 00:00:00 2001 From: Jeroen Ooms Date: Thu, 3 Jul 2025 15:24:41 +0200 Subject: [PATCH 2/6] Test issue #456 --- .github/workflows/distros.yml | 2 +- tests/testthat/test-xml_parse.R | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/distros.yml b/.github/workflows/distros.yml index 9425cb1b..45684ded 100644 --- a/.github/workflows/distros.yml +++ b/.github/workflows/distros.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - distro: [ 'rocky-8', 'rocky-9', 'ubuntu-20.04', 'ubuntu-22.04', 'debian-10', 'debian-11', 'debian-12'] + distro: [ 'rocky-8', 'rocky-9', 'ubuntu-20.04', 'ubuntu-22.04', 'debian-10', 'debian-11', 'debian-12', 'debian-13', 'fedora'] #arch: [ 'amd64', 'arm64' ] arch: [ 'amd64' ] container: diff --git a/tests/testthat/test-xml_parse.R b/tests/testthat/test-xml_parse.R index b030b71d..fbe66e10 100644 --- a/tests/testthat/test-xml_parse.R +++ b/tests/testthat/test-xml_parse.R @@ -111,3 +111,8 @@ test_that("read_xml and read_html fail with > 1 input", { read_html(c("foo", "bar")) }) }) + +test_that("Truncated HTML should not error", { + res <- read_html(' Date: Thu, 3 Jul 2025 16:39:50 +0200 Subject: [PATCH 3/6] disable test --- tests/testthat/test-xml_parse.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-xml_parse.R 
b/tests/testthat/test-xml_parse.R index fbe66e10..ebd68bb7 100644 --- a/tests/testthat/test-xml_parse.R +++ b/tests/testthat/test-xml_parse.R @@ -112,7 +112,8 @@ test_that("read_xml and read_html fail with > 1 input", { }) }) -test_that("Truncated HTML should not error", { - res <- read_html(' Date: Thu, 3 Jul 2025 16:42:36 +0200 Subject: [PATCH 4/6] Allow big HTML by default I don't see any downside from enabling this, and it is common with rmarkdown et al. See issue #455 --- DESCRIPTION | 2 +- R/xml_parse.R | 4 ++-- man/read_xml.Rd | 7 ++++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c44b42ae..3efad521 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,7 +36,7 @@ VignetteBuilder: Config/Needs/website: tidyverse/tidytemplate Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 SystemRequirements: libxml2: libxml2-dev (deb), libxml2-devel (rpm) Collate: 'S4.R' diff --git a/R/xml_parse.R b/R/xml_parse.R index 7efc1451..a89b5bf5 100644 --- a/R/xml_parse.R +++ b/R/xml_parse.R @@ -69,7 +69,7 @@ read_xml <- function(x, encoding = "", ..., as_html = FALSE, options = "NOBLANKS read_html <- function(x, encoding = "", ..., - options = c("RECOVER", "NOERROR", "NOBLANKS")) { + options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE")) { UseMethod("read_html") } @@ -77,7 +77,7 @@ read_html <- function(x, read_html.default <- function(x, encoding = "", ..., - options = c("RECOVER", "NOERROR", "NOBLANKS")) { + options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE")) { options <- parse_options(options, xml_parse_options()) suppressWarnings(read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options)) diff --git a/man/read_xml.Rd b/man/read_xml.Rd index d5967041..dbcf883f 100644 --- a/man/read_xml.Rd +++ b/man/read_xml.Rd @@ -10,7 +10,12 @@ \usage{ read_xml(x, encoding = "", ..., as_html = FALSE, options = "NOBLANKS") -read_html(x, encoding = "", ..., options = c("RECOVER", "NOERROR", 
"NOBLANKS")) +read_html( + x, + encoding = "", + ..., + options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE") +) \method{read_xml}{character}(x, encoding = "", ..., as_html = FALSE, options = "NOBLANKS") From f9483fc4582ad7e4c876a3bd9093a911215148c8 Mon Sep 17 00:00:00 2001 From: Jeroen Ooms Date: Thu, 3 Jul 2025 16:44:42 +0200 Subject: [PATCH 5/6] Bump version as common practice --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3efad521..9bc8d6aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: xml2 Title: Parse XML -Version: 1.3.8 +Version: 1.3.9000 Authors@R: c( person("Hadley", "Wickham", role = "aut"), person("Jim", "Hester", role = "aut"), diff --git a/NEWS.md b/NEWS.md index f98be4e3..c70fcc05 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# xml2 1.4.0 + +* read_html() now allows huge elements by default (#455) + # xml2 1.3.8 * Replace new "non-api" call IS_S4_OBJECT with Rf_isS4 From b05d22a6e72bf94ce58a1483344db477969a9a98 Mon Sep 17 00:00:00 2001 From: Jeroen Ooms Date: Mon, 7 Jul 2025 20:51:30 +0200 Subject: [PATCH 6/6] Workaround for xQuartz/Cocoa on MacOS hitting our global error handler. Fixes #458 --- NEWS.md | 2 ++ src/xml2_init.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/NEWS.md b/NEWS.md index c70fcc05..bc676418 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,8 @@ * read_html() now allows huge elements by default (#455) +* Workaround for xQuartz/Cocoa on MacOS hitting our global error handler. 
+ # xml2 1.3.8 * Replace new "non-api" call IS_S4_OBJECT with Rf_isS4 diff --git a/src/xml2_init.c b/src/xml2_init.c index d34e15c3..6451a83f 100644 --- a/src/xml2_init.c +++ b/src/xml2_init.c @@ -22,6 +22,15 @@ void handleStructuredError(void* userData, xmlError* error) { error->message[len-1] = '\0'; } + //Workaround for https://github.com/r-lib/xml2/issues/458: silently drop the "document empty" error when the input buffer is exactly "icns"; error->ctxt is only set when a parser context is available, so it is NULL-checked before use +#ifdef __APPLE__ + xmlParserCtxt *ctxt = error->ctxt; + static unsigned char icns[5] = { 'i', 'c', 'n', 's', '\0' }; + if(error->code == XML_ERR_DOCUMENT_EMPTY && ctxt && ctxt->input && ctxt->input->base && xmlStrcmp(ctxt->input->base, icns) == 0){ + return; + } +#endif + if (error->level <= 2) { Rf_warning("%s [%i]", error->message, (int) error->code); } else {