メモ:Rでemojiを扱うにはどうすれば... part2

追記(2017/05/13): useBytes = TRUEにした場合は、regmatches()で抜き出すのはいけるけど、regmatches<-で置換するのはエラーになる。うーん…

x <- "<img src='test.png' alt='🍉'>"
m <- gregexpr("<\\s*[Ii][Mm][Gg]\\s+[Ss][Rr][Cc]\\s*=\\s*[\"']([^\"']+)[\"']", x, useBytes = TRUE)
regmatches(x, m)
#> [[1]]
#> [1] "<img src='test.png'"

regmatches(x, m) <- toupper(regmatches(x, m))
#> Error in nchar(u) : 
#>   number of characters is not computable in "bytes" encoding, element 1



wm <- emo::ji("watermelon")
grep(".", wm, perl = TRUE)
#> [1] 1
grepl(".", wm, perl = TRUE)
#> [1] TRUE
sub(".", "a", wm, perl = TRUE)
#> a 
gsub(".", "a", wm, perl = TRUE)
#> a 
regexpr(".", wm, perl = TRUE)
#> Error in regexpr(".", wm, perl = TRUE) : 
#>   invalid input '🍉' in 'utf8towcs'
gregexpr(".", wm, perl = TRUE)
#> Error in gregexpr(".", wm, perl = TRUE) : 
#>   invalid input '🍉' in 'utf8towcs'
regexec(".", wm, perl = TRUE)
#> Error in regexpr(pattern, text, ignore.case = ignore.case, useBytes = useBytes,  : 
#>   invalid input '🍉' in 'utf8towcs'

useBytes = TRUEを付ければ大丈夫。

regexpr(".", wm, perl = TRUE, useBytes = TRUE)
#> [1] 1
#> attr(,"match.length")
#> [1] 1
#> attr(,"useBytes")
#> [1] TRUE
gregexpr(".", wm, perl = TRUE, useBytes = TRUE)
#> [[1]]
#> [1] 1 2 3 4
#> attr(,"match.length")
#> [1] 1 1 1 1
#> attr(,"useBytes")
#> [1] TRUE
regexec(".", wm, perl = TRUE, useBytes = TRUE)
#> [[1]]
#> [1] 1
#> attr(,"match.length")
#> [1] 1
#> attr(,"useBytes")
#> [1] TRUE


The main effect of useBytes = TRUE is to avoid errors/warnings about invalid inputs and spurious matches in multibyte locales, but for regexpr it changes the interpretation of the output. It inhibits the conversion of inputs with marked encodings, and is forced if any input is found which is marked as "bytes" (see Encoding).


ただなんかよくわからないのは、返ってくるオブジェクトにもuseBytesという属性があるけど、これは引数のuseBytes =にTRUEとFALSEのどちらを指定してもTRUEになっている。

regexpr(".", "a", perl = TRUE, useBytes = TRUE)
#> [1] 1
#> attr(,"match.length")
#> [1] 1
#> attr(,"useBytes")
#> [1] TRUE
regexpr(".", "a", perl = TRUE)
#> [1] 1
#> attr(,"match.length")
#> [1] 1
#> attr(,"useBytes")
#> [1] TRUE


    if (!useBytes) {
    Rboolean onlyASCII = IS_ASCII(STRING_ELT(pat, 0));
    if (onlyASCII)
        for (i = 0; i < n; i++) {
        if(STRING_ELT(text, i) == NA_STRING) continue;
        if (!IS_ASCII(STRING_ELT(text, i))) {
            onlyASCII = FALSE;
    useBytes = onlyASCII;
    if (!useBytes) {
    Rboolean haveBytes = IS_BYTES(STRING_ELT(pat, 0));
    if (!haveBytes)
        for (i = 0; i < n; i++)
        if (IS_BYTES(STRING_ELT(text, i))) {
            haveBytes = TRUE;
    if(haveBytes) {
        useBytes = TRUE;

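(r-source/grep.c at 504179c8d3f4bc1036854e476e7378d57ff2d774 · wch/r-source · GitHub)

つまり、パターンと入力文字列がすべてASCIIの場合は、useBytes = FALSEを指定していても内部でuseBytesがTRUEに切り替えられる。逆に、非ASCII文字を含む入力では("bytes"とマークされていない限り)useBytesはFALSEのままになる。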


regexpr(".", "分岐", perl = TRUE)
#> [1] 1
#> attr(,"match.length")
#> [1] 1


process_images <- function(html, processor) {


要は<img src="..."のタグを正規表現で捕まえられればいいわけですが…、useBytes = TRUEにしたときにマルチバイト文字の中に"とかと同じバイト列が登場しないのかがいまいち確信が持てない。
