字元串處理中基本函數的使用 R自帶函數與stringr包函數對比 ...
字元串處理中基本函數的使用
R自帶函數與stringr包函數對比
> library(stringr) # 以下的str_*函數均來自stringr包,需先載入
> states <- row.names(USArrests)
> # 提取字元串子集
> substr(x = states, start = 1, stop = 4)
[1] "Alab" "Alas" "Ariz" "Arka" "Cali" "Colo" "Conn" "Dela" "Flor" "Geor" "Hawa" "Idah" "Illi" "Indi" "Iowa" "Kans" "Kent"
[18] "Loui" "Main" "Mary" "Mass" "Mich" "Minn" "Miss" "Miss" "Mont" "Nebr" "Neva" "New " "New " "New " "New " "Nort" "Nort"
[35] "Ohio" "Okla" "Oreg" "Penn" "Rhod" "Sout" "Sout" "Tenn" "Texa" "Utah" "Verm" "Virg" "Wash" "West" "Wisc" "Wyom"
> # 生成縮寫(縮寫長度至少為5個字元)
> abbreviate(states, minlength = 5)
Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware
"Alabm" "Alask" "Arizn" "Arkns" "Clfrn" "Colrd" "Cnnct" "Delwr"
Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
"Flord" "Georg" "Hawai" "Idaho" "Illns" "Indin" "Iowa" "Kanss"
Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi
"Kntck" "Lousn" "Maine" "Mryln" "Mssch" "Mchgn" "Mnnst" "Mssss"
Missouri Montana Nebraska Nevada New Hampshire New Jersey New Mexico New York
"Missr" "Montn" "Nbrsk" "Nevad" "NwHmp" "NwJrs" "NwMxc" "NwYrk"
North Carolina North Dakota Ohio Oklahoma Oregon Pennsylvania Rhode Island South Carolina
"NrthC" "NrthD" "Ohio" "Oklhm" "Oregn" "Pnnsy" "RhdIs" "SthCr"
South Dakota Tennessee Texas Utah Vermont Virginia Washington West Virginia
"SthDk" "Tnnss" "Texas" "Utah" "Vrmnt" "Virgn" "Wshng" "WstVr"
Wisconsin Wyoming
"Wscns" "Wymng"
> # 計算字元串長度
> nchar(states)
[1] 7 6 7 8 10 8 11 8 7 7 6 5 8 7 4 6 8 9 5 8 13 8 9 11 8 7 8 6 13 10 10 8 14 12 4 8 6 12 12 14 12
[42] 9 5 4 7 8 10 13 9 7
> # str_count未指定pattern時,預設pattern = "",即統計字元個數,結果與nchar相同
> str_count(states)
[1] 7 6 7 8 10 8 11 8 7 7 6 5 8 7 4 6 8 9 5 8 13 8 9 11 8 7 8 6 13 10 10 8 14 12 4 8 6 12 12 14 12
[42] 9 5 4 7 8 10 13 9 7
> str_length(states)
[1] 7 6 7 8 10 8 11 8 7 7 6 5 8 7 4 6 8 9 5 8 13 8 9 11 8 7 8 6 13 10 10 8 14 12 4 8 6 12 12 14 12
[42] 9 5 4 7 8 10 13 9 7
> # 大寫和小寫
> tolower(states) # 變為小寫
[1] "alabama" "alaska" "arizona" "arkansas" "california" "colorado" "connecticut"
[8] "delaware" "florida" "georgia" "hawaii" "idaho" "illinois" "indiana"
[15] "iowa" "kansas" "kentucky" "louisiana" "maine" "maryland" "massachusetts"
[22] "michigan" "minnesota" "mississippi" "missouri" "montana" "nebraska" "nevada"
[29] "new hampshire" "new jersey" "new mexico" "new york" "north carolina" "north dakota" "ohio"
[36] "oklahoma" "oregon" "pennsylvania" "rhode island" "south carolina" "south dakota" "tennessee"
[43] "texas" "utah" "vermont" "virginia" "washington" "west virginia" "wisconsin"
[50] "wyoming"
> toupper(states) # 變為大寫
[1] "ALABAMA" "ALASKA" "ARIZONA" "ARKANSAS" "CALIFORNIA" "COLORADO" "CONNECTICUT"
[8] "DELAWARE" "FLORIDA" "GEORGIA" "HAWAII" "IDAHO" "ILLINOIS" "INDIANA"
[15] "IOWA" "KANSAS" "KENTUCKY" "LOUISIANA" "MAINE" "MARYLAND" "MASSACHUSETTS"
[22] "MICHIGAN" "MINNESOTA" "MISSISSIPPI" "MISSOURI" "MONTANA" "NEBRASKA" "NEVADA"
[29] "NEW HAMPSHIRE" "NEW JERSEY" "NEW MEXICO" "NEW YORK" "NORTH CAROLINA" "NORTH DAKOTA" "OHIO"
[36] "OKLAHOMA" "OREGON" "PENNSYLVANIA" "RHODE ISLAND" "SOUTH CAROLINA" "SOUTH DAKOTA" "TENNESSEE"
[43] "TEXAS" "UTAH" "VERMONT" "VIRGINIA" "WASHINGTON" "WEST VIRGINIA" "WISCONSIN"
[50] "WYOMING"
> # 符號替換
> chartr("Tt", "Uu", "AgCTcctTagct")
[1] "AgCUccuUagcu"
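> # 注意:pattern = "T" 區分大小寫,只替換大寫的T,結果與上面的chartr不同
> str_replace_all("AgCTcctTagct", pattern = "T", replacement = "U")
[1] "AgCUcctUagct"
> # 若要與chartr("Tt", "Uu", ...)等效,可用命名向量同時指定大小寫的替換
> str_replace_all("AgCTcctTagct", c("T" = "U", "t" = "u"))
[1] "AgCUccuUagcu"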
> # 字元串連接
> paste("control", 1:3, sep = "_")
[1] "control_1" "control_2" "control_3"
> str_c("control", 1:3, sep = "_")
[1] "control_1" "control_2" "control_3"
> x <- c("I love R", "I'm fascinated by Statistics", "I")
> # 包含匹配
> grep(pattern = "love", x = x)
[1] 1
> grep(pattern = "love", x = x, value = TRUE)
[1] "I love R"
> grepl(pattern = "love", x = x)
[1] TRUE FALSE FALSE
> str_detect(string = x, pattern = "love")
[1] TRUE FALSE FALSE
> # match返回第一個完全匹配的位置;%in%判斷是否存在完全匹配的元素
> match(x = "I",table = x)
[1] 3
> "I" %in% x
[1] TRUE
> # 字元串拆分
> text <- "I love R.\nI'm fascinated by Statistics."
> cat(text)
I love R.
I'm fascinated by Statistics.
> strsplit(text, split = " ")
[[1]]
[1] "I" "love" "R.\nI'm" "fascinated" "by" "Statistics."
> strsplit(text, split = "\\s")
[[1]]
[1] "I" "love" "R." "I'm" "fascinated" "by" "Statistics."
> str_split(text, pattern = "\\s")
[[1]]
[1] "I" "love" "R." "I'm" "fascinated" "by" "Statistics."
> # 匹配替換
> test_vector3 <- c("Without the vowels,We can still read the word.")
> sub(pattern = "[aeiou]",replacement = "-",x = test_vector3)
[1] "W-thout the vowels,We can still read the word."
> gsub(pattern = "[aeiou]",replacement = "-",x = test_vector3)
[1] "W-th--t th- v-w-ls,W- c-n st-ll r--d th- w-rd."
> str_replace_all(string = test_vector3, pattern = "[aeiou]",
+ replacement = "-")
[1] "W-th--t th- v-w-ls,W- c-n st-ll r--d th- w-rd."
> # 字元串定製輸出
> string <- "Each character string in the input is first split into\n paragraphs
+ (or lines containing whitespace)"
> strwrap(x = string, width = 30)
[1] "Each character string in the" "input is first split into" "paragraphs (or lines" "containing whitespace)"
> str_wrap(string = string, width = 30)
[1] "Each character string in\nthe input is first split\ninto paragraphs (or lines\ncontaining whitespace)"
> cat(str_wrap(string = string, width = 30))
Each character string in
the input is first split
into paragraphs (or lines
containing whitespace)