12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276 |
- // Copyright 2009 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // Package bytes implements functions for the manipulation of byte slices.
- // It is analogous to the facilities of the strings package.
- package bytes
- import (
- "internal/bytealg"
- "unicode"
- "unicode/utf8"
- )
// Equal reports whether a and b hold the same bytes: they must have the
// same length and identical contents.
// A nil slice compares equal to an empty slice.
func Equal(a, b []byte) bool {
	if len(a) != len(b) {
		return false
	}
	// Comparing the string conversions directly does not allocate under
	// either cmd/compile or gccgo.
	return string(a) == string(b)
}
- // Compare returns an integer comparing two byte slices lexicographically.
- // The result will be 0 if a == b, -1 if a < b, and +1 if a > b.
- // A nil argument is equivalent to an empty slice.
- func Compare(a, b []byte) int {
- return bytealg.Compare(a, b)
- }
// explode splits s into a slice of UTF-8 sequences, one per Unicode code point
// (each piece is still a slice of bytes), producing at most n subslices.
// Invalid UTF-8 sequences are chopped into individual bytes.
//
// If n <= 0 or n > len(s), the limit becomes len(s): s can never yield more
// pieces than it has bytes, so clamping keeps an oversized n from forcing an
// allocation far larger than the result. When the limit is reached, the final
// element is the unsplit remainder of s.
func explode(s []byte, n int) [][]byte {
	if n <= 0 || n > len(s) {
		n = len(s)
	}
	a := make([][]byte, n)
	var size int
	na := 0
	for len(s) > 0 {
		if na+1 >= n {
			// Last permitted slot: it takes everything that is left.
			a[na] = s
			na++
			break
		}
		_, size = utf8.DecodeRune(s)
		// Cap each piece at its own length so that appending to it
		// cannot overwrite the bytes that follow in s.
		a[na] = s[0:size:size]
		s = s[size:]
		na++
	}
	return a[0:na]
}
- // Count counts the number of non-overlapping instances of sep in s.
- // If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
- func Count(s, sep []byte) int {
- // special case
- if len(sep) == 0 {
- return utf8.RuneCount(s) + 1
- }
- if len(sep) == 1 {
- return bytealg.Count(s, sep[0])
- }
- n := 0
- for {
- i := Index(s, sep)
- if i == -1 {
- return n
- }
- n++
- s = s[i+len(sep):]
- }
- }
- // Contains reports whether subslice is within b.
- func Contains(b, subslice []byte) bool {
- return Index(b, subslice) != -1
- }
- // ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
- func ContainsAny(b []byte, chars string) bool {
- return IndexAny(b, chars) >= 0
- }
- // ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
- func ContainsRune(b []byte, r rune) bool {
- return IndexRune(b, r) >= 0
- }
- // IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b.
- func IndexByte(b []byte, c byte) int {
- return bytealg.IndexByte(b, c)
- }
// indexBytePortable is a plain linear scan that returns the index of the
// first byte in s equal to c, or -1 if there is none.
func indexBytePortable(s []byte, c byte) int {
	for i := 0; i < len(s); i++ {
		if s[i] == c {
			return i
		}
	}
	return -1
}
- // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
- func LastIndex(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return len(s)
- case n == 1:
- return LastIndexByte(s, sep[0])
- case n == len(s):
- if Equal(s, sep) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- }
- // Rabin-Karp search from the end of the string
- hashss, pow := bytealg.HashStrRevBytes(sep)
- last := len(s) - n
- var h uint32
- for i := len(s) - 1; i >= last; i-- {
- h = h*bytealg.PrimeRK + uint32(s[i])
- }
- if h == hashss && Equal(s[last:], sep) {
- return last
- }
- for i := last - 1; i >= 0; i-- {
- h *= bytealg.PrimeRK
- h += uint32(s[i])
- h -= pow * uint32(s[i+n])
- if h == hashss && Equal(s[i:i+n], sep) {
- return i
- }
- }
- return -1
- }
// LastIndexByte returns the index of the last occurrence of c in s,
// or -1 if c does not appear in s.
func LastIndexByte(s []byte, c byte) int {
	// end is one past the candidate position, so the loop stops cleanly at 0.
	for end := len(s); end > 0; end-- {
		if s[end-1] == c {
			return end - 1
		}
	}
	return -1
}
- // IndexRune interprets s as a sequence of UTF-8-encoded code points.
- // It returns the byte index of the first occurrence in s of the given rune.
- // It returns -1 if rune is not present in s.
- // If r is utf8.RuneError, it returns the first instance of any
- // invalid UTF-8 byte sequence.
- func IndexRune(s []byte, r rune) int {
- switch {
- case 0 <= r && r < utf8.RuneSelf:
- return IndexByte(s, byte(r))
- case r == utf8.RuneError:
- for i := 0; i < len(s); {
- r1, n := utf8.DecodeRune(s[i:])
- if r1 == utf8.RuneError {
- return i
- }
- i += n
- }
- return -1
- case !utf8.ValidRune(r):
- return -1
- default:
- var b [utf8.UTFMax]byte
- n := utf8.EncodeRune(b[:], r)
- return Index(s, b[:n])
- }
- }
- // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
- // It returns the byte index of the first occurrence in s of any of the Unicode
- // code points in chars. It returns -1 if chars is empty or if there is no code
- // point in common.
- func IndexAny(s []byte, chars string) int {
- if chars == "" {
- // Avoid scanning all of s.
- return -1
- }
- if len(s) == 1 {
- r := rune(s[0])
- if r >= utf8.RuneSelf {
- // search utf8.RuneError.
- for _, r = range chars {
- if r == utf8.RuneError {
- return 0
- }
- }
- return -1
- }
- if bytealg.IndexByteString(chars, s[0]) >= 0 {
- return 0
- }
- return -1
- }
- if len(chars) == 1 {
- r := rune(chars[0])
- if r >= utf8.RuneSelf {
- r = utf8.RuneError
- }
- return IndexRune(s, r)
- }
- if len(s) > 8 {
- if as, isASCII := makeASCIISet(chars); isASCII {
- for i, c := range s {
- if as.contains(c) {
- return i
- }
- }
- return -1
- }
- }
- var width int
- for i := 0; i < len(s); i += width {
- r := rune(s[i])
- if r < utf8.RuneSelf {
- if bytealg.IndexByteString(chars, s[i]) >= 0 {
- return i
- }
- width = 1
- continue
- }
- r, width = utf8.DecodeRune(s[i:])
- if r != utf8.RuneError {
- // r is 2 to 4 bytes
- if len(chars) == width {
- if chars == string(r) {
- return i
- }
- continue
- }
- // Use bytealg.IndexString for performance if available.
- if bytealg.MaxLen >= width {
- if bytealg.IndexString(chars, string(r)) >= 0 {
- return i
- }
- continue
- }
- }
- for _, ch := range chars {
- if r == ch {
- return i
- }
- }
- }
- return -1
- }
- // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
- // points. It returns the byte index of the last occurrence in s of any of
- // the Unicode code points in chars. It returns -1 if chars is empty or if
- // there is no code point in common.
- func LastIndexAny(s []byte, chars string) int {
- if chars == "" {
- // Avoid scanning all of s.
- return -1
- }
- if len(s) > 8 {
- if as, isASCII := makeASCIISet(chars); isASCII {
- for i := len(s) - 1; i >= 0; i-- {
- if as.contains(s[i]) {
- return i
- }
- }
- return -1
- }
- }
- if len(s) == 1 {
- r := rune(s[0])
- if r >= utf8.RuneSelf {
- for _, r = range chars {
- if r == utf8.RuneError {
- return 0
- }
- }
- return -1
- }
- if bytealg.IndexByteString(chars, s[0]) >= 0 {
- return 0
- }
- return -1
- }
- if len(chars) == 1 {
- cr := rune(chars[0])
- if cr >= utf8.RuneSelf {
- cr = utf8.RuneError
- }
- for i := len(s); i > 0; {
- r, size := utf8.DecodeLastRune(s[:i])
- i -= size
- if r == cr {
- return i
- }
- }
- return -1
- }
- for i := len(s); i > 0; {
- r := rune(s[i-1])
- if r < utf8.RuneSelf {
- if bytealg.IndexByteString(chars, s[i-1]) >= 0 {
- return i - 1
- }
- i--
- continue
- }
- r, size := utf8.DecodeLastRune(s[:i])
- i -= size
- if r != utf8.RuneError {
- // r is 2 to 4 bytes
- if len(chars) == size {
- if chars == string(r) {
- return i
- }
- continue
- }
- // Use bytealg.IndexString for performance if available.
- if bytealg.MaxLen >= size {
- if bytealg.IndexString(chars, string(r)) >= 0 {
- return i
- }
- continue
- }
- }
- for _, ch := range chars {
- if r == ch {
- return i
- }
- }
- }
- return -1
- }
- // Generic split: splits after each instance of sep,
- // including sepSave bytes of sep in the subslices.
- func genSplit(s, sep []byte, sepSave, n int) [][]byte {
- if n == 0 {
- return nil
- }
- if len(sep) == 0 {
- return explode(s, n)
- }
- if n < 0 {
- n = Count(s, sep) + 1
- }
- a := make([][]byte, n)
- n--
- i := 0
- for i < n {
- m := Index(s, sep)
- if m < 0 {
- break
- }
- a[i] = s[: m+sepSave : m+sepSave]
- s = s[m+len(sep):]
- i++
- }
- a[i] = s
- return a[:i+1]
- }
- // SplitN slices s into subslices separated by sep and returns a slice of
- // the subslices between those separators.
- // If sep is empty, SplitN splits after each UTF-8 sequence.
- // The count determines the number of subslices to return:
- // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
- // n == 0: the result is nil (zero subslices)
- // n < 0: all subslices
- //
- // To split around the first instance of a separator, see Cut.
- func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
- // SplitAfterN slices s into subslices after each instance of sep and
- // returns a slice of those subslices.
- // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
- // The count determines the number of subslices to return:
- // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
- // n == 0: the result is nil (zero subslices)
- // n < 0: all subslices
- func SplitAfterN(s, sep []byte, n int) [][]byte {
- return genSplit(s, sep, len(sep), n)
- }
- // Split slices s into all subslices separated by sep and returns a slice of
- // the subslices between those separators.
- // If sep is empty, Split splits after each UTF-8 sequence.
- // It is equivalent to SplitN with a count of -1.
- //
- // To split around the first instance of a separator, see Cut.
- func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
- // SplitAfter slices s into all subslices after each instance of sep and
- // returns a slice of those subslices.
- // If sep is empty, SplitAfter splits after each UTF-8 sequence.
- // It is equivalent to SplitAfterN with a count of -1.
- func SplitAfter(s, sep []byte) [][]byte {
- return genSplit(s, sep, len(sep), -1)
- }
// asciiSpace is a lookup table indexed by byte value: the entry is 1 for
// the six ASCII white space characters listed here and 0 for every other byte.
var asciiSpace = [256]uint8{
	' ':  1,
	'\t': 1,
	'\n': 1,
	'\v': 1,
	'\f': 1,
	'\r': 1,
}
- // Fields interprets s as a sequence of UTF-8-encoded code points.
- // It splits the slice s around each instance of one or more consecutive white space
- // characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
- // empty slice if s contains only white space.
- func Fields(s []byte) [][]byte {
- // First count the fields.
- // This is an exact count if s is ASCII, otherwise it is an approximation.
- n := 0
- wasSpace := 1
- // setBits is used to track which bits are set in the bytes of s.
- setBits := uint8(0)
- for i := 0; i < len(s); i++ {
- r := s[i]
- setBits |= r
- isSpace := int(asciiSpace[r])
- n += wasSpace & ^isSpace
- wasSpace = isSpace
- }
- if setBits >= utf8.RuneSelf {
- // Some runes in the input slice are not ASCII.
- return FieldsFunc(s, unicode.IsSpace)
- }
- // ASCII fast path
- a := make([][]byte, n)
- na := 0
- fieldStart := 0
- i := 0
- // Skip spaces in the front of the input.
- for i < len(s) && asciiSpace[s[i]] != 0 {
- i++
- }
- fieldStart = i
- for i < len(s) {
- if asciiSpace[s[i]] == 0 {
- i++
- continue
- }
- a[na] = s[fieldStart:i:i]
- na++
- i++
- // Skip spaces in between fields.
- for i < len(s) && asciiSpace[s[i]] != 0 {
- i++
- }
- fieldStart = i
- }
- if fieldStart < len(s) { // Last field might end at EOF.
- a[na] = s[fieldStart:len(s):len(s)]
- }
- return a
- }
// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s at each run of code points c satisfying f(c) and
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
// len(s) == 0, an empty slice is returned.
//
// FieldsFunc makes no guarantees about the order in which it calls f(c)
// and assumes that f always returns the same value for a given c.
func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
	// bounds describes the half-open byte range s[lo:hi] of one field.
	type bounds struct {
		lo, hi int
	}
	found := make([]bounds, 0, 32)

	// First pass: locate the fields. Collecting indices first and slicing
	// afterwards is significantly more efficient than slicing as we go,
	// possibly due to cache effects.
	lo := -1 // start of the field being scanned; negative while between fields
	for i := 0; i < len(s); {
		r, width := rune(s[i]), 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(s[i:])
		}
		if f(r) {
			if lo >= 0 {
				found = append(found, bounds{lo, i})
				lo = -1
			}
		} else if lo < 0 {
			lo = i
		}
		i += width
	}
	// The last field may run to the end of s.
	if lo >= 0 {
		found = append(found, bounds{lo, len(s)})
	}

	// Second pass: turn the recorded ranges into subslices, each capped at
	// its own length.
	out := make([][]byte, len(found))
	for k := range found {
		b := found[k]
		out[k] = s[b.lo:b.hi:b.hi]
	}
	return out
}
// Join concatenates the elements of s into a newly allocated byte slice,
// placing the separator sep between consecutive elements.
func Join(s [][]byte, sep []byte) []byte {
	switch len(s) {
	case 0:
		return []byte{}
	case 1:
		// Just return a copy.
		return append([]byte(nil), s[0]...)
	}

	// Compute the exact result size: every element plus one separator
	// between each adjacent pair.
	total := len(sep) * (len(s) - 1)
	for i := range s {
		total += len(s[i])
	}

	out := make([]byte, total)
	w := copy(out, s[0])
	for i := 1; i < len(s); i++ {
		w += copy(out[w:], sep)
		w += copy(out[w:], s[i])
	}
	return out
}
- // HasPrefix tests whether the byte slice s begins with prefix.
- func HasPrefix(s, prefix []byte) bool {
- return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
- }
- // HasSuffix tests whether the byte slice s ends with suffix.
- func HasSuffix(s, suffix []byte) bool {
- return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
- }
// Map returns a copy of the byte slice s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the byte slice with no replacement. The characters in s and the
// output are interpreted as UTF-8-encoded code points.
func Map(mapping func(r rune) rune, s []byte) []byte {
	// The output starts at the size of the input. Mapping can make it grow,
	// which is rare and handled below; shrinking falls out naturally.
	capacity := len(s) // length of out
	used := 0          // number of bytes encoded into out so far
	out := make([]byte, capacity)
	for i := 0; i < len(s); {
		r, width := rune(s[i]), 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(s[i:])
		}
		i += width

		r = mapping(r)
		if r < 0 {
			// Negative result: drop the character.
			continue
		}
		need := utf8.RuneLen(r)
		if need < 0 {
			// r has no valid encoding; EncodeRune will write
			// utf8.RuneError instead, so reserve room for that.
			need = len(string(utf8.RuneError))
		}
		if used+need > capacity {
			// Grow the buffer.
			capacity = capacity*2 + utf8.UTFMax
			grown := make([]byte, capacity)
			copy(grown, out[:used])
			out = grown
		}
		used += utf8.EncodeRune(out[used:capacity], r)
	}
	return out[:used]
}
// Repeat returns a new byte slice consisting of count copies of b.
//
// It panics if count is negative or if
// the result of (len(b) * count) overflows.
func Repeat(b []byte, count int) []byte {
	// There is no way to return an error, so a negative count or an
	// overflowing result size panics. See Issue golang.org/issue/16237.
	switch {
	case count == 0:
		return []byte{}
	case count < 0:
		panic("bytes: negative Repeat count")
	case len(b)*count/count != len(b):
		panic("bytes: Repeat count causes overflow")
	}

	out := make([]byte, len(b)*count)
	// Seed the buffer with one copy of b, then repeatedly duplicate the
	// filled prefix, doubling the filled length on every pass.
	for filled := copy(out, b); filled < len(out); filled *= 2 {
		copy(out[filled:], out[:filled])
	}
	return out
}
- // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to
- // their upper case.
- func ToUpper(s []byte) []byte {
- isASCII, hasLower := true, false
- for i := 0; i < len(s); i++ {
- c := s[i]
- if c >= utf8.RuneSelf {
- isASCII = false
- break
- }
- hasLower = hasLower || ('a' <= c && c <= 'z')
- }
- if isASCII { // optimize for ASCII-only byte slices.
- if !hasLower {
- // Just return a copy.
- return append([]byte(""), s...)
- }
- b := make([]byte, len(s))
- for i := 0; i < len(s); i++ {
- c := s[i]
- if 'a' <= c && c <= 'z' {
- c -= 'a' - 'A'
- }
- b[i] = c
- }
- return b
- }
- return Map(unicode.ToUpper, s)
- }
- // ToLower returns a copy of the byte slice s with all Unicode letters mapped to
- // their lower case.
- func ToLower(s []byte) []byte {
- isASCII, hasUpper := true, false
- for i := 0; i < len(s); i++ {
- c := s[i]
- if c >= utf8.RuneSelf {
- isASCII = false
- break
- }
- hasUpper = hasUpper || ('A' <= c && c <= 'Z')
- }
- if isASCII { // optimize for ASCII-only byte slices.
- if !hasUpper {
- return append([]byte(""), s...)
- }
- b := make([]byte, len(s))
- for i := 0; i < len(s); i++ {
- c := s[i]
- if 'A' <= c && c <= 'Z' {
- c += 'a' - 'A'
- }
- b[i] = c
- }
- return b
- }
- return Map(unicode.ToLower, s)
- }
- // ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
- func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
- // ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
- // upper case, giving priority to the special casing rules.
- func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
- return Map(c.ToUpper, s)
- }
- // ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
- // lower case, giving priority to the special casing rules.
- func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
- return Map(c.ToLower, s)
- }
- // ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
- // title case, giving priority to the special casing rules.
- func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
- return Map(c.ToTitle, s)
- }
// ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes
// representing invalid UTF-8 replaced with the bytes in replacement, which may be empty.
func ToValidUTF8(s, replacement []byte) []byte {
	out := make([]byte, 0, len(s)+len(replacement))
	inBadRun := false // the previous byte belonged to an invalid UTF-8 sequence
	for i := 0; i < len(s); {
		c := s[i]
		if c < utf8.RuneSelf {
			// ASCII is always valid and ends any invalid run.
			out = append(out, c)
			inBadRun = false
			i++
			continue
		}
		_, width := utf8.DecodeRune(s[i:])
		if width != 1 {
			// A multi-byte decode means a well-formed sequence: copy it.
			out = append(out, s[i:i+width]...)
			inBadRun = false
			i += width
			continue
		}
		// A non-ASCII byte that decodes with width 1 is invalid. Emit the
		// replacement only once per run of consecutive invalid bytes.
		if !inBadRun {
			out = append(out, replacement...)
			inBadRun = true
		}
		i++
	}
	return out
}
// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
func isSeparator(r rune) bool {
	if r <= 0x7F {
		// In the ASCII range, only alphanumerics and underscore are not
		// separators.
		wordChar := r == '_' ||
			('0' <= r && r <= '9') ||
			('a' <= r && r <= 'z') ||
			('A' <= r && r <= 'Z')
		return !wordChar
	}
	// Letters and digits are not separators.
	if unicode.IsLetter(r) || unicode.IsDigit(r) {
		return false
	}
	// Otherwise, all we can do for now is treat spaces as separators.
	return unicode.IsSpace(r)
}
- // Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
- // words mapped to their title case.
- //
- // Deprecated: The rule Title uses for word boundaries does not handle Unicode
- // punctuation properly. Use golang.org/x/text/cases instead.
- func Title(s []byte) []byte {
- // Use a closure here to remember state.
- // Hackish but effective. Depends on Map scanning in order and calling
- // the closure once per rune.
- prev := ' '
- return Map(
- func(r rune) rune {
- if isSeparator(prev) {
- prev = r
- return unicode.ToTitle(r)
- }
- prev = r
- return r
- },
- s)
- }
- // TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
- // all leading UTF-8-encoded code points c that satisfy f(c).
- func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
- i := indexFunc(s, f, false)
- if i == -1 {
- return nil
- }
- return s[i:]
- }
- // TrimRightFunc returns a subslice of s by slicing off all trailing
- // UTF-8-encoded code points c that satisfy f(c).
- func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
- i := lastIndexFunc(s, f, false)
- if i >= 0 && s[i] >= utf8.RuneSelf {
- _, wid := utf8.DecodeRune(s[i:])
- i += wid
- } else {
- i++
- }
- return s[0:i]
- }
- // TrimFunc returns a subslice of s by slicing off all leading and trailing
- // UTF-8-encoded code points c that satisfy f(c).
- func TrimFunc(s []byte, f func(r rune) bool) []byte {
- return TrimRightFunc(TrimLeftFunc(s, f), f)
- }
- // TrimPrefix returns s without the provided leading prefix string.
- // If s doesn't start with prefix, s is returned unchanged.
- func TrimPrefix(s, prefix []byte) []byte {
- if HasPrefix(s, prefix) {
- return s[len(prefix):]
- }
- return s
- }
- // TrimSuffix returns s without the provided trailing suffix string.
- // If s doesn't end with suffix, s is returned unchanged.
- func TrimSuffix(s, suffix []byte) []byte {
- if HasSuffix(s, suffix) {
- return s[:len(s)-len(suffix)]
- }
- return s
- }
- // IndexFunc interprets s as a sequence of UTF-8-encoded code points.
- // It returns the byte index in s of the first Unicode
- // code point satisfying f(c), or -1 if none do.
- func IndexFunc(s []byte, f func(r rune) bool) int {
- return indexFunc(s, f, true)
- }
- // LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
- // It returns the byte index in s of the last Unicode
- // code point satisfying f(c), or -1 if none do.
- func LastIndexFunc(s []byte, f func(r rune) bool) int {
- return lastIndexFunc(s, f, true)
- }
// indexFunc is the same as IndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted: it returns the byte index of the first code point c
// for which f(c) == truth, or -1 if there is none.
func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
	for i := 0; i < len(s); {
		// ASCII bytes are their own rune; anything else needs decoding.
		r, width := rune(s[i]), 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(s[i:])
		}
		if f(r) == truth {
			return i
		}
		i += width
	}
	return -1
}
// lastIndexFunc is the same as LastIndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted: it returns the byte index of the last code point c
// for which f(c) == truth, or -1 if there is none.
func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
	end := len(s)
	for end > 0 {
		// ASCII bytes are their own rune; anything else needs decoding
		// backwards from end.
		last := s[end-1]
		r, width := rune(last), 1
		if last >= utf8.RuneSelf {
			r, width = utf8.DecodeLastRune(s[:end])
		}
		// Step back so that end is the start index of r.
		end -= width
		if f(r) == truth {
			return end
		}
	}
	return -1
}
// asciiSet is a 32-byte value, where each bit represents the presence of a
// given ASCII character in the set. The 128 bits of the lower 16 bytes,
// starting with the least-significant bit of the lowest word to the
// most-significant bit of the highest word, map to the full range of all
// 128 ASCII characters. The 128 bits of the upper 16 bytes will be zeroed,
// ensuring that any non-ASCII character will be reported as not in the set.
// This allocates a total of 32 bytes even though the upper half
// is unused to avoid bounds checks in asciiSet.contains.
type asciiSet [8]uint32

// makeASCIISet creates a set of ASCII characters and reports whether all
// characters in chars are ASCII. When ok is false, the scan stopped at the
// first non-ASCII byte and the returned set only holds the bytes before it.
func makeASCIISet(chars string) (as asciiSet, ok bool) {
	for i := 0; i < len(chars); i++ {
		c := chars[i]
		if c >= utf8.RuneSelf {
			return as, false
		}
		// The top three bits of c select the word, the low five the bit.
		word, bit := c>>5, c&31
		as[word] |= 1 << bit
	}
	return as, true
}

// contains reports whether c is inside the set.
func (as *asciiSet) contains(c byte) bool {
	mask := uint32(1) << (c & 31)
	return as[c>>5]&mask != 0
}
// containsRune is a simplified version of strings.ContainsRune
// to avoid importing the strings package.
// We avoid bytes.ContainsRune to avoid allocating a temporary copy of s.
// It reports whether r is one of the code points of s.
func containsRune(s string, r rune) bool {
	found := false
	for _, c := range s {
		if c == r {
			found = true
			break
		}
	}
	return found
}
- // Trim returns a subslice of s by slicing off all leading and
- // trailing UTF-8-encoded code points contained in cutset.
- func Trim(s []byte, cutset string) []byte {
- if len(s) == 0 || cutset == "" {
- return s
- }
- if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
- return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0])
- }
- if as, ok := makeASCIISet(cutset); ok {
- return trimLeftASCII(trimRightASCII(s, &as), &as)
- }
- return trimLeftUnicode(trimRightUnicode(s, cutset), cutset)
- }
- // TrimLeft returns a subslice of s by slicing off all leading
- // UTF-8-encoded code points contained in cutset.
- func TrimLeft(s []byte, cutset string) []byte {
- if len(s) == 0 || cutset == "" {
- return s
- }
- if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
- return trimLeftByte(s, cutset[0])
- }
- if as, ok := makeASCIISet(cutset); ok {
- return trimLeftASCII(s, &as)
- }
- return trimLeftUnicode(s, cutset)
- }
// trimLeftByte returns the subslice of s that remains after removing every
// leading byte equal to c.
func trimLeftByte(s []byte, c byte) []byte {
	skip := 0
	for skip < len(s) && s[skip] == c {
		skip++
	}
	return s[skip:]
}
- func trimLeftASCII(s []byte, as *asciiSet) []byte {
- for len(s) > 0 {
- if !as.contains(s[0]) {
- break
- }
- s = s[1:]
- }
- return s
- }
- func trimLeftUnicode(s []byte, cutset string) []byte {
- for len(s) > 0 {
- r, n := rune(s[0]), 1
- if r >= utf8.RuneSelf {
- r, n = utf8.DecodeRune(s)
- }
- if !containsRune(cutset, r) {
- break
- }
- s = s[n:]
- }
- return s
- }
- // TrimRight returns a subslice of s by slicing off all trailing
- // UTF-8-encoded code points that are contained in cutset.
- func TrimRight(s []byte, cutset string) []byte {
- if len(s) == 0 || cutset == "" {
- return s
- }
- if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
- return trimRightByte(s, cutset[0])
- }
- if as, ok := makeASCIISet(cutset); ok {
- return trimRightASCII(s, &as)
- }
- return trimRightUnicode(s, cutset)
- }
// trimRightByte returns the subslice of s that remains after removing every
// trailing byte equal to c.
func trimRightByte(s []byte, c byte) []byte {
	end := len(s)
	for end > 0 && s[end-1] == c {
		end--
	}
	return s[:end]
}
- func trimRightASCII(s []byte, as *asciiSet) []byte {
- for len(s) > 0 {
- if !as.contains(s[len(s)-1]) {
- break
- }
- s = s[:len(s)-1]
- }
- return s
- }
- func trimRightUnicode(s []byte, cutset string) []byte {
- for len(s) > 0 {
- r, n := rune(s[len(s)-1]), 1
- if r >= utf8.RuneSelf {
- r, n = utf8.DecodeLastRune(s)
- }
- if !containsRune(cutset, r) {
- break
- }
- s = s[:len(s)-n]
- }
- return s
- }
- // TrimSpace returns a subslice of s by slicing off all leading and
- // trailing white space, as defined by Unicode.
- func TrimSpace(s []byte) []byte {
- // Fast path for ASCII: look for the first ASCII non-space byte
- start := 0
- for ; start < len(s); start++ {
- c := s[start]
- if c >= utf8.RuneSelf {
- // If we run into a non-ASCII byte, fall back to the
- // slower unicode-aware method on the remaining bytes
- return TrimFunc(s[start:], unicode.IsSpace)
- }
- if asciiSpace[c] == 0 {
- break
- }
- }
- // Now look for the first ASCII non-space byte from the end
- stop := len(s)
- for ; stop > start; stop-- {
- c := s[stop-1]
- if c >= utf8.RuneSelf {
- return TrimFunc(s[start:stop], unicode.IsSpace)
- }
- if asciiSpace[c] == 0 {
- break
- }
- }
- // At this point s[start:stop] starts and ends with an ASCII
- // non-space bytes, so we're done. Non-ASCII cases have already
- // been handled above.
- if start == stop {
- // Special case to preserve previous TrimLeftFunc behavior,
- // returning nil instead of empty slice if all spaces.
- return nil
- }
- return s[start:stop]
- }
// Runes interprets s as a sequence of UTF-8-encoded code points.
// It returns a slice of runes (Unicode code points) equivalent to s.
func Runes(s []byte) []rune {
	out := make([]rune, utf8.RuneCount(s))
	next := 0
	for pos := 0; pos < len(s); next++ {
		r, width := utf8.DecodeRune(s[pos:])
		out[next] = r
		pos += width
	}
	return out
}
- // Replace returns a copy of the slice s with the first n
- // non-overlapping instances of old replaced by new.
- // If old is empty, it matches at the beginning of the slice
- // and after each UTF-8 sequence, yielding up to k+1 replacements
- // for a k-rune slice.
- // If n < 0, there is no limit on the number of replacements.
- func Replace(s, old, new []byte, n int) []byte {
- m := 0
- if n != 0 {
- // Compute number of replacements.
- m = Count(s, old)
- }
- if m == 0 {
- // Just return a copy.
- return append([]byte(nil), s...)
- }
- if n < 0 || m < n {
- n = m
- }
- // Apply replacements to buffer.
- t := make([]byte, len(s)+n*(len(new)-len(old)))
- w := 0
- start := 0
- for i := 0; i < n; i++ {
- j := start
- if len(old) == 0 {
- if i > 0 {
- _, wid := utf8.DecodeRune(s[start:])
- j += wid
- }
- } else {
- j += Index(s[start:], old)
- }
- w += copy(t[w:], s[start:j])
- w += copy(t[w:], new)
- start = j + len(old)
- }
- w += copy(t[w:], s[start:])
- return t[0:w]
- }
- // ReplaceAll returns a copy of the slice s with all
- // non-overlapping instances of old replaced by new.
- // If old is empty, it matches at the beginning of the slice
- // and after each UTF-8 sequence, yielding up to k+1 replacements
- // for a k-rune slice.
- func ReplaceAll(s, old, new []byte) []byte {
- return Replace(s, old, new, -1)
- }
// EqualFold reports whether s and t, read as UTF-8 strings, are equal
// under Unicode case-folding, a more general form of
// case-insensitivity.
func EqualFold(s, t []byte) bool {
	i, j := 0, 0
	for i < len(s) && j < len(t) {
		// Pull the next rune off each input, with a one-byte fast path
		// for ASCII.
		var lo, hi rune
		if b := s[i]; b < utf8.RuneSelf {
			lo = rune(b)
			i++
		} else {
			r, width := utf8.DecodeRune(s[i:])
			lo = r
			i += width
		}
		if b := t[j]; b < utf8.RuneSelf {
			hi = rune(b)
			j++
		} else {
			r, width := utf8.DecodeRune(t[j:])
			hi = r
			j += width
		}

		// Identical runes need no folding.
		if lo == hi {
			continue
		}

		// Order the pair so that lo < hi for the checks below.
		if hi < lo {
			lo, hi = hi, lo
		}

		// Both runes are ASCII: they fold together only when lo is an
		// upper-case letter and hi is its lower-case counterpart.
		if hi < utf8.RuneSelf {
			if lo < 'A' || lo > 'Z' || hi != lo+'a'-'A' {
				return false
			}
			continue
		}

		// General case: SimpleFold(x) yields the next equivalent rune
		// greater than x, or wraps around to smaller values. Step
		// through lo's fold orbit until it wraps or reaches hi.
		folded := unicode.SimpleFold(lo)
		for folded != lo && folded < hi {
			folded = unicode.SimpleFold(folded)
		}
		if folded != hi {
			return false
		}
	}

	// At least one input is used up; they are equal only if both are.
	return len(s)-i == len(t)-j
}
- // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
- func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= bytealg.MaxLen:
- // Use brute force when s and sep both are small
- if len(s) <= bytealg.MaxBruteForce {
- return bytealg.Index(s, sep)
- }
- c0 := sep[0]
- c1 := sep[1]
- i := 0
- t := len(s) - n + 1
- fails := 0
- for i < t {
- if s[i] != c0 {
- // IndexByte is faster than bytealg.Index, so use it as long as
- // we're not getting lots of false positives.
- o := IndexByte(s[i+1:t], c0)
- if o < 0 {
- return -1
- }
- i += o + 1
- }
- if s[i+1] == c1 && Equal(s[i:i+n], sep) {
- return i
- }
- fails++
- i++
- // Switch to bytealg.Index when IndexByte produces too many false positives.
- if fails > bytealg.Cutover(i) {
- r := bytealg.Index(s[i:], sep)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- c0 := sep[0]
- c1 := sep[1]
- i := 0
- fails := 0
- t := len(s) - n + 1
- for i < t {
- if s[i] != c0 {
- o := IndexByte(s[i+1:t], c0)
- if o < 0 {
- break
- }
- i += o + 1
- }
- if s[i+1] == c1 && Equal(s[i:i+n], sep) {
- return i
- }
- i++
- fails++
- if fails >= 4+i>>4 && i < t {
- // Give up on IndexByte, it isn't skipping ahead
- // far enough to be better than Rabin-Karp.
- // Experiments (using IndexPeriodic) suggest
- // the cutover is about 16 byte skips.
- // TODO: if large prefixes of sep are matching
- // we should cutover at even larger average skips,
- // because Equal becomes that much more expensive.
- // This code does not take that effect into account.
- j := bytealg.IndexRabinKarpBytes(s[i:], sep)
- if j < 0 {
- return -1
- }
- return i + j
- }
- }
- return -1
- }
- // Cut slices s around the first instance of sep,
- // returning the text before and after sep.
- // The found result reports whether sep appears in s.
- // If sep does not appear in s, cut returns s, nil, false.
- //
- // Cut returns slices of the original slice s, not copies.
- func Cut(s, sep []byte) (before, after []byte, found bool) {
- if i := Index(s, sep); i >= 0 {
- return s[:i], s[i+len(sep):], true
- }
- return s, nil, false
- }
|