bytes.go 31 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276
  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package bytes implements functions for the manipulation of byte slices.
  5. // It is analogous to the facilities of the strings package.
  6. package bytes
  7. import (
  8. "internal/bytealg"
  9. "unicode"
  10. "unicode/utf8"
  11. )
  12. // Equal reports whether a and b
  13. // are the same length and contain the same bytes.
  14. // A nil argument is equivalent to an empty slice.
  15. func Equal(a, b []byte) bool {
  16. // Neither cmd/compile nor gccgo allocates for these string conversions.
  17. return string(a) == string(b)
  18. }
  19. // Compare returns an integer comparing two byte slices lexicographically.
  20. // The result will be 0 if a == b, -1 if a < b, and +1 if a > b.
  21. // A nil argument is equivalent to an empty slice.
  22. func Compare(a, b []byte) int {
  23. return bytealg.Compare(a, b)
  24. }
  25. // explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
  26. // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
  27. func explode(s []byte, n int) [][]byte {
  28. if n <= 0 {
  29. n = len(s)
  30. }
  31. a := make([][]byte, n)
  32. var size int
  33. na := 0
  34. for len(s) > 0 {
  35. if na+1 >= n {
  36. a[na] = s
  37. na++
  38. break
  39. }
  40. _, size = utf8.DecodeRune(s)
  41. a[na] = s[0:size:size]
  42. s = s[size:]
  43. na++
  44. }
  45. return a[0:na]
  46. }
  47. // Count counts the number of non-overlapping instances of sep in s.
  48. // If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
  49. func Count(s, sep []byte) int {
  50. // special case
  51. if len(sep) == 0 {
  52. return utf8.RuneCount(s) + 1
  53. }
  54. if len(sep) == 1 {
  55. return bytealg.Count(s, sep[0])
  56. }
  57. n := 0
  58. for {
  59. i := Index(s, sep)
  60. if i == -1 {
  61. return n
  62. }
  63. n++
  64. s = s[i+len(sep):]
  65. }
  66. }
  67. // Contains reports whether subslice is within b.
  68. func Contains(b, subslice []byte) bool {
  69. return Index(b, subslice) != -1
  70. }
  71. // ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
  72. func ContainsAny(b []byte, chars string) bool {
  73. return IndexAny(b, chars) >= 0
  74. }
  75. // ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
  76. func ContainsRune(b []byte, r rune) bool {
  77. return IndexRune(b, r) >= 0
  78. }
  79. // IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b.
  80. func IndexByte(b []byte, c byte) int {
  81. return bytealg.IndexByte(b, c)
  82. }
  83. func indexBytePortable(s []byte, c byte) int {
  84. for i, b := range s {
  85. if b == c {
  86. return i
  87. }
  88. }
  89. return -1
  90. }
  91. // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
  92. func LastIndex(s, sep []byte) int {
  93. n := len(sep)
  94. switch {
  95. case n == 0:
  96. return len(s)
  97. case n == 1:
  98. return LastIndexByte(s, sep[0])
  99. case n == len(s):
  100. if Equal(s, sep) {
  101. return 0
  102. }
  103. return -1
  104. case n > len(s):
  105. return -1
  106. }
  107. // Rabin-Karp search from the end of the string
  108. hashss, pow := bytealg.HashStrRevBytes(sep)
  109. last := len(s) - n
  110. var h uint32
  111. for i := len(s) - 1; i >= last; i-- {
  112. h = h*bytealg.PrimeRK + uint32(s[i])
  113. }
  114. if h == hashss && Equal(s[last:], sep) {
  115. return last
  116. }
  117. for i := last - 1; i >= 0; i-- {
  118. h *= bytealg.PrimeRK
  119. h += uint32(s[i])
  120. h -= pow * uint32(s[i+n])
  121. if h == hashss && Equal(s[i:i+n], sep) {
  122. return i
  123. }
  124. }
  125. return -1
  126. }
  127. // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
  128. func LastIndexByte(s []byte, c byte) int {
  129. for i := len(s) - 1; i >= 0; i-- {
  130. if s[i] == c {
  131. return i
  132. }
  133. }
  134. return -1
  135. }
  136. // IndexRune interprets s as a sequence of UTF-8-encoded code points.
  137. // It returns the byte index of the first occurrence in s of the given rune.
  138. // It returns -1 if rune is not present in s.
  139. // If r is utf8.RuneError, it returns the first instance of any
  140. // invalid UTF-8 byte sequence.
  141. func IndexRune(s []byte, r rune) int {
  142. switch {
  143. case 0 <= r && r < utf8.RuneSelf:
  144. return IndexByte(s, byte(r))
  145. case r == utf8.RuneError:
  146. for i := 0; i < len(s); {
  147. r1, n := utf8.DecodeRune(s[i:])
  148. if r1 == utf8.RuneError {
  149. return i
  150. }
  151. i += n
  152. }
  153. return -1
  154. case !utf8.ValidRune(r):
  155. return -1
  156. default:
  157. var b [utf8.UTFMax]byte
  158. n := utf8.EncodeRune(b[:], r)
  159. return Index(s, b[:n])
  160. }
  161. }
  162. // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
  163. // It returns the byte index of the first occurrence in s of any of the Unicode
  164. // code points in chars. It returns -1 if chars is empty or if there is no code
  165. // point in common.
  166. func IndexAny(s []byte, chars string) int {
  167. if chars == "" {
  168. // Avoid scanning all of s.
  169. return -1
  170. }
  171. if len(s) == 1 {
  172. r := rune(s[0])
  173. if r >= utf8.RuneSelf {
  174. // search utf8.RuneError.
  175. for _, r = range chars {
  176. if r == utf8.RuneError {
  177. return 0
  178. }
  179. }
  180. return -1
  181. }
  182. if bytealg.IndexByteString(chars, s[0]) >= 0 {
  183. return 0
  184. }
  185. return -1
  186. }
  187. if len(chars) == 1 {
  188. r := rune(chars[0])
  189. if r >= utf8.RuneSelf {
  190. r = utf8.RuneError
  191. }
  192. return IndexRune(s, r)
  193. }
  194. if len(s) > 8 {
  195. if as, isASCII := makeASCIISet(chars); isASCII {
  196. for i, c := range s {
  197. if as.contains(c) {
  198. return i
  199. }
  200. }
  201. return -1
  202. }
  203. }
  204. var width int
  205. for i := 0; i < len(s); i += width {
  206. r := rune(s[i])
  207. if r < utf8.RuneSelf {
  208. if bytealg.IndexByteString(chars, s[i]) >= 0 {
  209. return i
  210. }
  211. width = 1
  212. continue
  213. }
  214. r, width = utf8.DecodeRune(s[i:])
  215. if r != utf8.RuneError {
  216. // r is 2 to 4 bytes
  217. if len(chars) == width {
  218. if chars == string(r) {
  219. return i
  220. }
  221. continue
  222. }
  223. // Use bytealg.IndexString for performance if available.
  224. if bytealg.MaxLen >= width {
  225. if bytealg.IndexString(chars, string(r)) >= 0 {
  226. return i
  227. }
  228. continue
  229. }
  230. }
  231. for _, ch := range chars {
  232. if r == ch {
  233. return i
  234. }
  235. }
  236. }
  237. return -1
  238. }
  239. // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
  240. // points. It returns the byte index of the last occurrence in s of any of
  241. // the Unicode code points in chars. It returns -1 if chars is empty or if
  242. // there is no code point in common.
  243. func LastIndexAny(s []byte, chars string) int {
  244. if chars == "" {
  245. // Avoid scanning all of s.
  246. return -1
  247. }
  248. if len(s) > 8 {
  249. if as, isASCII := makeASCIISet(chars); isASCII {
  250. for i := len(s) - 1; i >= 0; i-- {
  251. if as.contains(s[i]) {
  252. return i
  253. }
  254. }
  255. return -1
  256. }
  257. }
  258. if len(s) == 1 {
  259. r := rune(s[0])
  260. if r >= utf8.RuneSelf {
  261. for _, r = range chars {
  262. if r == utf8.RuneError {
  263. return 0
  264. }
  265. }
  266. return -1
  267. }
  268. if bytealg.IndexByteString(chars, s[0]) >= 0 {
  269. return 0
  270. }
  271. return -1
  272. }
  273. if len(chars) == 1 {
  274. cr := rune(chars[0])
  275. if cr >= utf8.RuneSelf {
  276. cr = utf8.RuneError
  277. }
  278. for i := len(s); i > 0; {
  279. r, size := utf8.DecodeLastRune(s[:i])
  280. i -= size
  281. if r == cr {
  282. return i
  283. }
  284. }
  285. return -1
  286. }
  287. for i := len(s); i > 0; {
  288. r := rune(s[i-1])
  289. if r < utf8.RuneSelf {
  290. if bytealg.IndexByteString(chars, s[i-1]) >= 0 {
  291. return i - 1
  292. }
  293. i--
  294. continue
  295. }
  296. r, size := utf8.DecodeLastRune(s[:i])
  297. i -= size
  298. if r != utf8.RuneError {
  299. // r is 2 to 4 bytes
  300. if len(chars) == size {
  301. if chars == string(r) {
  302. return i
  303. }
  304. continue
  305. }
  306. // Use bytealg.IndexString for performance if available.
  307. if bytealg.MaxLen >= size {
  308. if bytealg.IndexString(chars, string(r)) >= 0 {
  309. return i
  310. }
  311. continue
  312. }
  313. }
  314. for _, ch := range chars {
  315. if r == ch {
  316. return i
  317. }
  318. }
  319. }
  320. return -1
  321. }
  322. // Generic split: splits after each instance of sep,
  323. // including sepSave bytes of sep in the subslices.
  324. func genSplit(s, sep []byte, sepSave, n int) [][]byte {
  325. if n == 0 {
  326. return nil
  327. }
  328. if len(sep) == 0 {
  329. return explode(s, n)
  330. }
  331. if n < 0 {
  332. n = Count(s, sep) + 1
  333. }
  334. a := make([][]byte, n)
  335. n--
  336. i := 0
  337. for i < n {
  338. m := Index(s, sep)
  339. if m < 0 {
  340. break
  341. }
  342. a[i] = s[: m+sepSave : m+sepSave]
  343. s = s[m+len(sep):]
  344. i++
  345. }
  346. a[i] = s
  347. return a[:i+1]
  348. }
  349. // SplitN slices s into subslices separated by sep and returns a slice of
  350. // the subslices between those separators.
  351. // If sep is empty, SplitN splits after each UTF-8 sequence.
  352. // The count determines the number of subslices to return:
  353. // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
  354. // n == 0: the result is nil (zero subslices)
  355. // n < 0: all subslices
  356. //
  357. // To split around the first instance of a separator, see Cut.
  358. func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
  359. // SplitAfterN slices s into subslices after each instance of sep and
  360. // returns a slice of those subslices.
  361. // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
  362. // The count determines the number of subslices to return:
  363. // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
  364. // n == 0: the result is nil (zero subslices)
  365. // n < 0: all subslices
  366. func SplitAfterN(s, sep []byte, n int) [][]byte {
  367. return genSplit(s, sep, len(sep), n)
  368. }
  369. // Split slices s into all subslices separated by sep and returns a slice of
  370. // the subslices between those separators.
  371. // If sep is empty, Split splits after each UTF-8 sequence.
  372. // It is equivalent to SplitN with a count of -1.
  373. //
  374. // To split around the first instance of a separator, see Cut.
  375. func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
  376. // SplitAfter slices s into all subslices after each instance of sep and
  377. // returns a slice of those subslices.
  378. // If sep is empty, SplitAfter splits after each UTF-8 sequence.
  379. // It is equivalent to SplitAfterN with a count of -1.
  380. func SplitAfter(s, sep []byte) [][]byte {
  381. return genSplit(s, sep, len(sep), -1)
  382. }
  383. var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
  384. // Fields interprets s as a sequence of UTF-8-encoded code points.
  385. // It splits the slice s around each instance of one or more consecutive white space
  386. // characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
  387. // empty slice if s contains only white space.
  388. func Fields(s []byte) [][]byte {
  389. // First count the fields.
  390. // This is an exact count if s is ASCII, otherwise it is an approximation.
  391. n := 0
  392. wasSpace := 1
  393. // setBits is used to track which bits are set in the bytes of s.
  394. setBits := uint8(0)
  395. for i := 0; i < len(s); i++ {
  396. r := s[i]
  397. setBits |= r
  398. isSpace := int(asciiSpace[r])
  399. n += wasSpace & ^isSpace
  400. wasSpace = isSpace
  401. }
  402. if setBits >= utf8.RuneSelf {
  403. // Some runes in the input slice are not ASCII.
  404. return FieldsFunc(s, unicode.IsSpace)
  405. }
  406. // ASCII fast path
  407. a := make([][]byte, n)
  408. na := 0
  409. fieldStart := 0
  410. i := 0
  411. // Skip spaces in the front of the input.
  412. for i < len(s) && asciiSpace[s[i]] != 0 {
  413. i++
  414. }
  415. fieldStart = i
  416. for i < len(s) {
  417. if asciiSpace[s[i]] == 0 {
  418. i++
  419. continue
  420. }
  421. a[na] = s[fieldStart:i:i]
  422. na++
  423. i++
  424. // Skip spaces in between fields.
  425. for i < len(s) && asciiSpace[s[i]] != 0 {
  426. i++
  427. }
  428. fieldStart = i
  429. }
  430. if fieldStart < len(s) { // Last field might end at EOF.
  431. a[na] = s[fieldStart:len(s):len(s)]
  432. }
  433. return a
  434. }
  435. // FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
  436. // It splits the slice s at each run of code points c satisfying f(c) and
  437. // returns a slice of subslices of s. If all code points in s satisfy f(c), or
  438. // len(s) == 0, an empty slice is returned.
  439. //
  440. // FieldsFunc makes no guarantees about the order in which it calls f(c)
  441. // and assumes that f always returns the same value for a given c.
  442. func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
  443. // A span is used to record a slice of s of the form s[start:end].
  444. // The start index is inclusive and the end index is exclusive.
  445. type span struct {
  446. start int
  447. end int
  448. }
  449. spans := make([]span, 0, 32)
  450. // Find the field start and end indices.
  451. // Doing this in a separate pass (rather than slicing the string s
  452. // and collecting the result substrings right away) is significantly
  453. // more efficient, possibly due to cache effects.
  454. start := -1 // valid span start if >= 0
  455. for i := 0; i < len(s); {
  456. size := 1
  457. r := rune(s[i])
  458. if r >= utf8.RuneSelf {
  459. r, size = utf8.DecodeRune(s[i:])
  460. }
  461. if f(r) {
  462. if start >= 0 {
  463. spans = append(spans, span{start, i})
  464. start = -1
  465. }
  466. } else {
  467. if start < 0 {
  468. start = i
  469. }
  470. }
  471. i += size
  472. }
  473. // Last field might end at EOF.
  474. if start >= 0 {
  475. spans = append(spans, span{start, len(s)})
  476. }
  477. // Create subslices from recorded field indices.
  478. a := make([][]byte, len(spans))
  479. for i, span := range spans {
  480. a[i] = s[span.start:span.end:span.end]
  481. }
  482. return a
  483. }
  484. // Join concatenates the elements of s to create a new byte slice. The separator
  485. // sep is placed between elements in the resulting slice.
  486. func Join(s [][]byte, sep []byte) []byte {
  487. if len(s) == 0 {
  488. return []byte{}
  489. }
  490. if len(s) == 1 {
  491. // Just return a copy.
  492. return append([]byte(nil), s[0]...)
  493. }
  494. n := len(sep) * (len(s) - 1)
  495. for _, v := range s {
  496. n += len(v)
  497. }
  498. b := make([]byte, n)
  499. bp := copy(b, s[0])
  500. for _, v := range s[1:] {
  501. bp += copy(b[bp:], sep)
  502. bp += copy(b[bp:], v)
  503. }
  504. return b
  505. }
  506. // HasPrefix tests whether the byte slice s begins with prefix.
  507. func HasPrefix(s, prefix []byte) bool {
  508. return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
  509. }
  510. // HasSuffix tests whether the byte slice s ends with suffix.
  511. func HasSuffix(s, suffix []byte) bool {
  512. return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
  513. }
  514. // Map returns a copy of the byte slice s with all its characters modified
  515. // according to the mapping function. If mapping returns a negative value, the character is
  516. // dropped from the byte slice with no replacement. The characters in s and the
  517. // output are interpreted as UTF-8-encoded code points.
  518. func Map(mapping func(r rune) rune, s []byte) []byte {
  519. // In the worst case, the slice can grow when mapped, making
  520. // things unpleasant. But it's so rare we barge in assuming it's
  521. // fine. It could also shrink but that falls out naturally.
  522. maxbytes := len(s) // length of b
  523. nbytes := 0 // number of bytes encoded in b
  524. b := make([]byte, maxbytes)
  525. for i := 0; i < len(s); {
  526. wid := 1
  527. r := rune(s[i])
  528. if r >= utf8.RuneSelf {
  529. r, wid = utf8.DecodeRune(s[i:])
  530. }
  531. r = mapping(r)
  532. if r >= 0 {
  533. rl := utf8.RuneLen(r)
  534. if rl < 0 {
  535. rl = len(string(utf8.RuneError))
  536. }
  537. if nbytes+rl > maxbytes {
  538. // Grow the buffer.
  539. maxbytes = maxbytes*2 + utf8.UTFMax
  540. nb := make([]byte, maxbytes)
  541. copy(nb, b[0:nbytes])
  542. b = nb
  543. }
  544. nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
  545. }
  546. i += wid
  547. }
  548. return b[0:nbytes]
  549. }
  550. // Repeat returns a new byte slice consisting of count copies of b.
  551. //
  552. // It panics if count is negative or if
  553. // the result of (len(b) * count) overflows.
  554. func Repeat(b []byte, count int) []byte {
  555. if count == 0 {
  556. return []byte{}
  557. }
  558. // Since we cannot return an error on overflow,
  559. // we should panic if the repeat will generate
  560. // an overflow.
  561. // See Issue golang.org/issue/16237.
  562. if count < 0 {
  563. panic("bytes: negative Repeat count")
  564. } else if len(b)*count/count != len(b) {
  565. panic("bytes: Repeat count causes overflow")
  566. }
  567. nb := make([]byte, len(b)*count)
  568. bp := copy(nb, b)
  569. for bp < len(nb) {
  570. copy(nb[bp:], nb[:bp])
  571. bp *= 2
  572. }
  573. return nb
  574. }
  575. // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to
  576. // their upper case.
  577. func ToUpper(s []byte) []byte {
  578. isASCII, hasLower := true, false
  579. for i := 0; i < len(s); i++ {
  580. c := s[i]
  581. if c >= utf8.RuneSelf {
  582. isASCII = false
  583. break
  584. }
  585. hasLower = hasLower || ('a' <= c && c <= 'z')
  586. }
  587. if isASCII { // optimize for ASCII-only byte slices.
  588. if !hasLower {
  589. // Just return a copy.
  590. return append([]byte(""), s...)
  591. }
  592. b := make([]byte, len(s))
  593. for i := 0; i < len(s); i++ {
  594. c := s[i]
  595. if 'a' <= c && c <= 'z' {
  596. c -= 'a' - 'A'
  597. }
  598. b[i] = c
  599. }
  600. return b
  601. }
  602. return Map(unicode.ToUpper, s)
  603. }
  604. // ToLower returns a copy of the byte slice s with all Unicode letters mapped to
  605. // their lower case.
  606. func ToLower(s []byte) []byte {
  607. isASCII, hasUpper := true, false
  608. for i := 0; i < len(s); i++ {
  609. c := s[i]
  610. if c >= utf8.RuneSelf {
  611. isASCII = false
  612. break
  613. }
  614. hasUpper = hasUpper || ('A' <= c && c <= 'Z')
  615. }
  616. if isASCII { // optimize for ASCII-only byte slices.
  617. if !hasUpper {
  618. return append([]byte(""), s...)
  619. }
  620. b := make([]byte, len(s))
  621. for i := 0; i < len(s); i++ {
  622. c := s[i]
  623. if 'A' <= c && c <= 'Z' {
  624. c += 'a' - 'A'
  625. }
  626. b[i] = c
  627. }
  628. return b
  629. }
  630. return Map(unicode.ToLower, s)
  631. }
  632. // ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
  633. func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
  634. // ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
  635. // upper case, giving priority to the special casing rules.
  636. func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
  637. return Map(c.ToUpper, s)
  638. }
  639. // ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
  640. // lower case, giving priority to the special casing rules.
  641. func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
  642. return Map(c.ToLower, s)
  643. }
  644. // ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
  645. // title case, giving priority to the special casing rules.
  646. func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
  647. return Map(c.ToTitle, s)
  648. }
  649. // ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes
  650. // representing invalid UTF-8 replaced with the bytes in replacement, which may be empty.
  651. func ToValidUTF8(s, replacement []byte) []byte {
  652. b := make([]byte, 0, len(s)+len(replacement))
  653. invalid := false // previous byte was from an invalid UTF-8 sequence
  654. for i := 0; i < len(s); {
  655. c := s[i]
  656. if c < utf8.RuneSelf {
  657. i++
  658. invalid = false
  659. b = append(b, c)
  660. continue
  661. }
  662. _, wid := utf8.DecodeRune(s[i:])
  663. if wid == 1 {
  664. i++
  665. if !invalid {
  666. invalid = true
  667. b = append(b, replacement...)
  668. }
  669. continue
  670. }
  671. invalid = false
  672. b = append(b, s[i:i+wid]...)
  673. i += wid
  674. }
  675. return b
  676. }
  677. // isSeparator reports whether the rune could mark a word boundary.
  678. // TODO: update when package unicode captures more of the properties.
  679. func isSeparator(r rune) bool {
  680. // ASCII alphanumerics and underscore are not separators
  681. if r <= 0x7F {
  682. switch {
  683. case '0' <= r && r <= '9':
  684. return false
  685. case 'a' <= r && r <= 'z':
  686. return false
  687. case 'A' <= r && r <= 'Z':
  688. return false
  689. case r == '_':
  690. return false
  691. }
  692. return true
  693. }
  694. // Letters and digits are not separators
  695. if unicode.IsLetter(r) || unicode.IsDigit(r) {
  696. return false
  697. }
  698. // Otherwise, all we can do for now is treat spaces as separators.
  699. return unicode.IsSpace(r)
  700. }
  701. // Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
  702. // words mapped to their title case.
  703. //
  704. // Deprecated: The rule Title uses for word boundaries does not handle Unicode
  705. // punctuation properly. Use golang.org/x/text/cases instead.
  706. func Title(s []byte) []byte {
  707. // Use a closure here to remember state.
  708. // Hackish but effective. Depends on Map scanning in order and calling
  709. // the closure once per rune.
  710. prev := ' '
  711. return Map(
  712. func(r rune) rune {
  713. if isSeparator(prev) {
  714. prev = r
  715. return unicode.ToTitle(r)
  716. }
  717. prev = r
  718. return r
  719. },
  720. s)
  721. }
  722. // TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
  723. // all leading UTF-8-encoded code points c that satisfy f(c).
  724. func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
  725. i := indexFunc(s, f, false)
  726. if i == -1 {
  727. return nil
  728. }
  729. return s[i:]
  730. }
  731. // TrimRightFunc returns a subslice of s by slicing off all trailing
  732. // UTF-8-encoded code points c that satisfy f(c).
  733. func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
  734. i := lastIndexFunc(s, f, false)
  735. if i >= 0 && s[i] >= utf8.RuneSelf {
  736. _, wid := utf8.DecodeRune(s[i:])
  737. i += wid
  738. } else {
  739. i++
  740. }
  741. return s[0:i]
  742. }
  743. // TrimFunc returns a subslice of s by slicing off all leading and trailing
  744. // UTF-8-encoded code points c that satisfy f(c).
  745. func TrimFunc(s []byte, f func(r rune) bool) []byte {
  746. return TrimRightFunc(TrimLeftFunc(s, f), f)
  747. }
  748. // TrimPrefix returns s without the provided leading prefix string.
  749. // If s doesn't start with prefix, s is returned unchanged.
  750. func TrimPrefix(s, prefix []byte) []byte {
  751. if HasPrefix(s, prefix) {
  752. return s[len(prefix):]
  753. }
  754. return s
  755. }
  756. // TrimSuffix returns s without the provided trailing suffix string.
  757. // If s doesn't end with suffix, s is returned unchanged.
  758. func TrimSuffix(s, suffix []byte) []byte {
  759. if HasSuffix(s, suffix) {
  760. return s[:len(s)-len(suffix)]
  761. }
  762. return s
  763. }
  764. // IndexFunc interprets s as a sequence of UTF-8-encoded code points.
  765. // It returns the byte index in s of the first Unicode
  766. // code point satisfying f(c), or -1 if none do.
  767. func IndexFunc(s []byte, f func(r rune) bool) int {
  768. return indexFunc(s, f, true)
  769. }
  770. // LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
  771. // It returns the byte index in s of the last Unicode
  772. // code point satisfying f(c), or -1 if none do.
  773. func LastIndexFunc(s []byte, f func(r rune) bool) int {
  774. return lastIndexFunc(s, f, true)
  775. }
  776. // indexFunc is the same as IndexFunc except that if
  777. // truth==false, the sense of the predicate function is
  778. // inverted.
  779. func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
  780. start := 0
  781. for start < len(s) {
  782. wid := 1
  783. r := rune(s[start])
  784. if r >= utf8.RuneSelf {
  785. r, wid = utf8.DecodeRune(s[start:])
  786. }
  787. if f(r) == truth {
  788. return start
  789. }
  790. start += wid
  791. }
  792. return -1
  793. }
  794. // lastIndexFunc is the same as LastIndexFunc except that if
  795. // truth==false, the sense of the predicate function is
  796. // inverted.
  797. func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
  798. for i := len(s); i > 0; {
  799. r, size := rune(s[i-1]), 1
  800. if r >= utf8.RuneSelf {
  801. r, size = utf8.DecodeLastRune(s[0:i])
  802. }
  803. i -= size
  804. if f(r) == truth {
  805. return i
  806. }
  807. }
  808. return -1
  809. }
  810. // asciiSet is a 32-byte value, where each bit represents the presence of a
  811. // given ASCII character in the set. The 128-bits of the lower 16 bytes,
  812. // starting with the least-significant bit of the lowest word to the
  813. // most-significant bit of the highest word, map to the full range of all
  814. // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
  815. // ensuring that any non-ASCII character will be reported as not in the set.
  816. // This allocates a total of 32 bytes even though the upper half
  817. // is unused to avoid bounds checks in asciiSet.contains.
  818. type asciiSet [8]uint32
  819. // makeASCIISet creates a set of ASCII characters and reports whether all
  820. // characters in chars are ASCII.
  821. func makeASCIISet(chars string) (as asciiSet, ok bool) {
  822. for i := 0; i < len(chars); i++ {
  823. c := chars[i]
  824. if c >= utf8.RuneSelf {
  825. return as, false
  826. }
  827. as[c/32] |= 1 << (c % 32)
  828. }
  829. return as, true
  830. }
  831. // contains reports whether c is inside the set.
  832. func (as *asciiSet) contains(c byte) bool {
  833. return (as[c/32] & (1 << (c % 32))) != 0
  834. }
  835. // containsRune is a simplified version of strings.ContainsRune
  836. // to avoid importing the strings package.
  837. // We avoid bytes.ContainsRune to avoid allocating a temporary copy of s.
  838. func containsRune(s string, r rune) bool {
  839. for _, c := range s {
  840. if c == r {
  841. return true
  842. }
  843. }
  844. return false
  845. }
  846. // Trim returns a subslice of s by slicing off all leading and
  847. // trailing UTF-8-encoded code points contained in cutset.
  848. func Trim(s []byte, cutset string) []byte {
  849. if len(s) == 0 || cutset == "" {
  850. return s
  851. }
  852. if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
  853. return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0])
  854. }
  855. if as, ok := makeASCIISet(cutset); ok {
  856. return trimLeftASCII(trimRightASCII(s, &as), &as)
  857. }
  858. return trimLeftUnicode(trimRightUnicode(s, cutset), cutset)
  859. }
  860. // TrimLeft returns a subslice of s by slicing off all leading
  861. // UTF-8-encoded code points contained in cutset.
  862. func TrimLeft(s []byte, cutset string) []byte {
  863. if len(s) == 0 || cutset == "" {
  864. return s
  865. }
  866. if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
  867. return trimLeftByte(s, cutset[0])
  868. }
  869. if as, ok := makeASCIISet(cutset); ok {
  870. return trimLeftASCII(s, &as)
  871. }
  872. return trimLeftUnicode(s, cutset)
  873. }
  874. func trimLeftByte(s []byte, c byte) []byte {
  875. for len(s) > 0 && s[0] == c {
  876. s = s[1:]
  877. }
  878. return s
  879. }
  880. func trimLeftASCII(s []byte, as *asciiSet) []byte {
  881. for len(s) > 0 {
  882. if !as.contains(s[0]) {
  883. break
  884. }
  885. s = s[1:]
  886. }
  887. return s
  888. }
  889. func trimLeftUnicode(s []byte, cutset string) []byte {
  890. for len(s) > 0 {
  891. r, n := rune(s[0]), 1
  892. if r >= utf8.RuneSelf {
  893. r, n = utf8.DecodeRune(s)
  894. }
  895. if !containsRune(cutset, r) {
  896. break
  897. }
  898. s = s[n:]
  899. }
  900. return s
  901. }
  902. // TrimRight returns a subslice of s by slicing off all trailing
  903. // UTF-8-encoded code points that are contained in cutset.
  904. func TrimRight(s []byte, cutset string) []byte {
  905. if len(s) == 0 || cutset == "" {
  906. return s
  907. }
  908. if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
  909. return trimRightByte(s, cutset[0])
  910. }
  911. if as, ok := makeASCIISet(cutset); ok {
  912. return trimRightASCII(s, &as)
  913. }
  914. return trimRightUnicode(s, cutset)
  915. }
  916. func trimRightByte(s []byte, c byte) []byte {
  917. for len(s) > 0 && s[len(s)-1] == c {
  918. s = s[:len(s)-1]
  919. }
  920. return s
  921. }
  922. func trimRightASCII(s []byte, as *asciiSet) []byte {
  923. for len(s) > 0 {
  924. if !as.contains(s[len(s)-1]) {
  925. break
  926. }
  927. s = s[:len(s)-1]
  928. }
  929. return s
  930. }
  931. func trimRightUnicode(s []byte, cutset string) []byte {
  932. for len(s) > 0 {
  933. r, n := rune(s[len(s)-1]), 1
  934. if r >= utf8.RuneSelf {
  935. r, n = utf8.DecodeLastRune(s)
  936. }
  937. if !containsRune(cutset, r) {
  938. break
  939. }
  940. s = s[:len(s)-n]
  941. }
  942. return s
  943. }
  944. // TrimSpace returns a subslice of s by slicing off all leading and
  945. // trailing white space, as defined by Unicode.
  946. func TrimSpace(s []byte) []byte {
  947. // Fast path for ASCII: look for the first ASCII non-space byte
  948. start := 0
  949. for ; start < len(s); start++ {
  950. c := s[start]
  951. if c >= utf8.RuneSelf {
  952. // If we run into a non-ASCII byte, fall back to the
  953. // slower unicode-aware method on the remaining bytes
  954. return TrimFunc(s[start:], unicode.IsSpace)
  955. }
  956. if asciiSpace[c] == 0 {
  957. break
  958. }
  959. }
  960. // Now look for the first ASCII non-space byte from the end
  961. stop := len(s)
  962. for ; stop > start; stop-- {
  963. c := s[stop-1]
  964. if c >= utf8.RuneSelf {
  965. return TrimFunc(s[start:stop], unicode.IsSpace)
  966. }
  967. if asciiSpace[c] == 0 {
  968. break
  969. }
  970. }
  971. // At this point s[start:stop] starts and ends with an ASCII
  972. // non-space bytes, so we're done. Non-ASCII cases have already
  973. // been handled above.
  974. if start == stop {
  975. // Special case to preserve previous TrimLeftFunc behavior,
  976. // returning nil instead of empty slice if all spaces.
  977. return nil
  978. }
  979. return s[start:stop]
  980. }
  981. // Runes interprets s as a sequence of UTF-8-encoded code points.
  982. // It returns a slice of runes (Unicode code points) equivalent to s.
  983. func Runes(s []byte) []rune {
  984. t := make([]rune, utf8.RuneCount(s))
  985. i := 0
  986. for len(s) > 0 {
  987. r, l := utf8.DecodeRune(s)
  988. t[i] = r
  989. i++
  990. s = s[l:]
  991. }
  992. return t
  993. }
  994. // Replace returns a copy of the slice s with the first n
  995. // non-overlapping instances of old replaced by new.
  996. // If old is empty, it matches at the beginning of the slice
  997. // and after each UTF-8 sequence, yielding up to k+1 replacements
  998. // for a k-rune slice.
  999. // If n < 0, there is no limit on the number of replacements.
  1000. func Replace(s, old, new []byte, n int) []byte {
  1001. m := 0
  1002. if n != 0 {
  1003. // Compute number of replacements.
  1004. m = Count(s, old)
  1005. }
  1006. if m == 0 {
  1007. // Just return a copy.
  1008. return append([]byte(nil), s...)
  1009. }
  1010. if n < 0 || m < n {
  1011. n = m
  1012. }
  1013. // Apply replacements to buffer.
  1014. t := make([]byte, len(s)+n*(len(new)-len(old)))
  1015. w := 0
  1016. start := 0
  1017. for i := 0; i < n; i++ {
  1018. j := start
  1019. if len(old) == 0 {
  1020. if i > 0 {
  1021. _, wid := utf8.DecodeRune(s[start:])
  1022. j += wid
  1023. }
  1024. } else {
  1025. j += Index(s[start:], old)
  1026. }
  1027. w += copy(t[w:], s[start:j])
  1028. w += copy(t[w:], new)
  1029. start = j + len(old)
  1030. }
  1031. w += copy(t[w:], s[start:])
  1032. return t[0:w]
  1033. }
  1034. // ReplaceAll returns a copy of the slice s with all
  1035. // non-overlapping instances of old replaced by new.
  1036. // If old is empty, it matches at the beginning of the slice
  1037. // and after each UTF-8 sequence, yielding up to k+1 replacements
  1038. // for a k-rune slice.
  1039. func ReplaceAll(s, old, new []byte) []byte {
  1040. return Replace(s, old, new, -1)
  1041. }
  1042. // EqualFold reports whether s and t, interpreted as UTF-8 strings,
  1043. // are equal under Unicode case-folding, which is a more general
  1044. // form of case-insensitivity.
  1045. func EqualFold(s, t []byte) bool {
  1046. for len(s) != 0 && len(t) != 0 {
  1047. // Extract first rune from each.
  1048. var sr, tr rune
  1049. if s[0] < utf8.RuneSelf {
  1050. sr, s = rune(s[0]), s[1:]
  1051. } else {
  1052. r, size := utf8.DecodeRune(s)
  1053. sr, s = r, s[size:]
  1054. }
  1055. if t[0] < utf8.RuneSelf {
  1056. tr, t = rune(t[0]), t[1:]
  1057. } else {
  1058. r, size := utf8.DecodeRune(t)
  1059. tr, t = r, t[size:]
  1060. }
  1061. // If they match, keep going; if not, return false.
  1062. // Easy case.
  1063. if tr == sr {
  1064. continue
  1065. }
  1066. // Make sr < tr to simplify what follows.
  1067. if tr < sr {
  1068. tr, sr = sr, tr
  1069. }
  1070. // Fast check for ASCII.
  1071. if tr < utf8.RuneSelf {
  1072. // ASCII only, sr/tr must be upper/lower case
  1073. if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
  1074. continue
  1075. }
  1076. return false
  1077. }
  1078. // General case. SimpleFold(x) returns the next equivalent rune > x
  1079. // or wraps around to smaller values.
  1080. r := unicode.SimpleFold(sr)
  1081. for r != sr && r < tr {
  1082. r = unicode.SimpleFold(r)
  1083. }
  1084. if r == tr {
  1085. continue
  1086. }
  1087. return false
  1088. }
  1089. // One string is empty. Are both?
  1090. return len(s) == len(t)
  1091. }
  1092. // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
  1093. func Index(s, sep []byte) int {
  1094. n := len(sep)
  1095. switch {
  1096. case n == 0:
  1097. return 0
  1098. case n == 1:
  1099. return IndexByte(s, sep[0])
  1100. case n == len(s):
  1101. if Equal(sep, s) {
  1102. return 0
  1103. }
  1104. return -1
  1105. case n > len(s):
  1106. return -1
  1107. case n <= bytealg.MaxLen:
  1108. // Use brute force when s and sep both are small
  1109. if len(s) <= bytealg.MaxBruteForce {
  1110. return bytealg.Index(s, sep)
  1111. }
  1112. c0 := sep[0]
  1113. c1 := sep[1]
  1114. i := 0
  1115. t := len(s) - n + 1
  1116. fails := 0
  1117. for i < t {
  1118. if s[i] != c0 {
  1119. // IndexByte is faster than bytealg.Index, so use it as long as
  1120. // we're not getting lots of false positives.
  1121. o := IndexByte(s[i+1:t], c0)
  1122. if o < 0 {
  1123. return -1
  1124. }
  1125. i += o + 1
  1126. }
  1127. if s[i+1] == c1 && Equal(s[i:i+n], sep) {
  1128. return i
  1129. }
  1130. fails++
  1131. i++
  1132. // Switch to bytealg.Index when IndexByte produces too many false positives.
  1133. if fails > bytealg.Cutover(i) {
  1134. r := bytealg.Index(s[i:], sep)
  1135. if r >= 0 {
  1136. return r + i
  1137. }
  1138. return -1
  1139. }
  1140. }
  1141. return -1
  1142. }
  1143. c0 := sep[0]
  1144. c1 := sep[1]
  1145. i := 0
  1146. fails := 0
  1147. t := len(s) - n + 1
  1148. for i < t {
  1149. if s[i] != c0 {
  1150. o := IndexByte(s[i+1:t], c0)
  1151. if o < 0 {
  1152. break
  1153. }
  1154. i += o + 1
  1155. }
  1156. if s[i+1] == c1 && Equal(s[i:i+n], sep) {
  1157. return i
  1158. }
  1159. i++
  1160. fails++
  1161. if fails >= 4+i>>4 && i < t {
  1162. // Give up on IndexByte, it isn't skipping ahead
  1163. // far enough to be better than Rabin-Karp.
  1164. // Experiments (using IndexPeriodic) suggest
  1165. // the cutover is about 16 byte skips.
  1166. // TODO: if large prefixes of sep are matching
  1167. // we should cutover at even larger average skips,
  1168. // because Equal becomes that much more expensive.
  1169. // This code does not take that effect into account.
  1170. j := bytealg.IndexRabinKarpBytes(s[i:], sep)
  1171. if j < 0 {
  1172. return -1
  1173. }
  1174. return i + j
  1175. }
  1176. }
  1177. return -1
  1178. }
  1179. // Cut slices s around the first instance of sep,
  1180. // returning the text before and after sep.
  1181. // The found result reports whether sep appears in s.
  1182. // If sep does not appear in s, cut returns s, nil, false.
  1183. //
  1184. // Cut returns slices of the original slice s, not copies.
  1185. func Cut(s, sep []byte) (before, after []byte, found bool) {
  1186. if i := Index(s, sep); i >= 0 {
  1187. return s[:i], s[i+len(sep):], true
  1188. }
  1189. return s, nil, false
  1190. }