| 1 | #ifndef MYCPP_GC_STR_H
|
| 2 | #define MYCPP_GC_STR_H
|
| 3 |
|
| 4 | #include <limits.h> // CHAR_BIT
|
| 5 |
|
| 6 | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
|
| 7 | #include "mycpp/gc_obj.h" // GC_OBJ
|
| 8 | #include "mycpp/hash.h" // HashFunc
|
| 9 |
|
| 10 | // https://stackoverflow.com/questions/3919995/determining-sprintf-buffer-size-whats-the-standard/11092994#11092994
|
| 11 | // Notes:
|
| 12 | // - Python 2.7's intobject.c has an erroneous +6
|
| 13 | // - This is 13, but len('-2147483648') is 11, which means we only need 12?
|
| 14 | // - This formula is valid for octal(), because 2^(3 bits) = 8
|
| 15 |
|
| 16 | const int kIntBufSize = CHAR_BIT * sizeof(int) / 3 + 3;
|
| 17 |
|
| 18 | template <typename T>
|
| 19 | class List;
|
| 20 |
|
| 21 | class BigStr {
|
| 22 | public:
|
| 23 | // Don't call this directly. Call NewStr() instead, which calls this.
|
| 24 | BigStr() {
|
| 25 | }
|
| 26 |
|
| 27 | char* data() {
|
| 28 | return data_;
|
| 29 | }
|
| 30 |
|
| 31 | // Call this after writing into buffer created by OverAllocatedStr()
|
| 32 | void MaybeShrink(int str_len);
|
| 33 |
|
| 34 | BigStr* at(int i);
|
| 35 |
|
| 36 | int find(BigStr* needle, int start = 0, int end = -1);
|
| 37 | int rfind(BigStr* needle);
|
| 38 |
|
| 39 | BigStr* slice(int begin);
|
| 40 | BigStr* slice(int begin, int end);
|
| 41 |
|
| 42 | BigStr* strip();
|
| 43 | // Used for CommandSub in osh/cmd_exec.py
|
| 44 | BigStr* rstrip(BigStr* chars);
|
| 45 | BigStr* rstrip();
|
| 46 |
|
| 47 | BigStr* lstrip(BigStr* chars);
|
| 48 | BigStr* lstrip();
|
| 49 |
|
| 50 | BigStr* ljust(int width, BigStr* fillchar);
|
| 51 | BigStr* rjust(int width, BigStr* fillchar);
|
| 52 |
|
| 53 | // Can take (start, end) so Tokens can be compared without allocation
|
| 54 | bool startswith(BigStr* s);
|
| 55 | bool endswith(BigStr* s);
|
| 56 |
|
| 57 | BigStr* replace(BigStr* old, BigStr* new_str);
|
| 58 | BigStr* replace(BigStr* old, BigStr* new_str, int count);
|
| 59 | BigStr* join(List<BigStr*>* items);
|
| 60 |
|
| 61 | List<BigStr*>* split(BigStr* sep);
|
| 62 | List<BigStr*>* split(BigStr* sep, int max_split);
|
| 63 | List<BigStr*>* splitlines(bool keep);
|
| 64 |
|
| 65 | // TODO: Move unicode functions out of mycpp runtime? Because we won't match
|
| 66 | // Python exactly
|
| 67 | bool isdigit();
|
| 68 | bool isalpha();
|
| 69 | bool isupper();
|
| 70 |
|
| 71 | BigStr* upper();
|
| 72 | BigStr* lower();
|
| 73 |
|
| 74 | // Other options for fast comparison / hashing / string interning:
|
| 75 | // - unique_id_: an index into intern table. I don't think this works unless
|
| 76 | // you want to deal with rehashing all strings when the set grows.
|
| 77 | // - although note that the JVM has -XX:StringTableSize=FIXED, which means
|
| 78 | // - it can degrade into linked list performance
|
| 79 | // - Hashed strings become GLOBAL_STR(). Never deallocated.
|
| 80 | // - Hashed strings become part of the "large object space", which might be
|
| 81 | // managed by mark and sweep. This requires linked list overhead.
|
| 82 | // (doubly-linked?)
|
| 83 | // - Intern strings at GARBAGE COLLECTION TIME, with
|
| 84 | // LayoutForwarded::new_location_? Is this possible? Does it introduce
|
| 85 | // too much coupling between strings, hash tables, and GC?
|
| 86 |
|
| 87 | static constexpr ObjHeader obj_header() {
|
| 88 | return ObjHeader::BigStr();
|
| 89 | }
|
| 90 |
|
| 91 | unsigned hash(HashFunc h);
|
| 92 |
|
| 93 | int len_;
|
| 94 | unsigned hash_ : 31;
|
| 95 | unsigned is_hashed_ : 1;
|
| 96 | char data_[1]; // flexible array
|
| 97 |
|
| 98 | private:
|
| 99 | int _strip_left_pos();
|
| 100 | int _strip_right_pos();
|
| 101 |
|
| 102 | DISALLOW_COPY_AND_ASSIGN(BigStr)
|
| 103 | };
|
| 104 |
|
| 105 | constexpr int kStrHeaderSize = offsetof(BigStr, data_);
|
| 106 |
|
| 107 | // Note: for SmallStr, we might copy into the VALUE
|
| 108 | inline void BigStr::MaybeShrink(int str_len) {
|
| 109 | len_ = str_len;
|
| 110 | data_[len_] = '\0'; // NUL terminate
|
| 111 | }
|
| 112 |
|
| 113 | inline int len(const BigStr* s) {
|
| 114 | return s->len_;
|
| 115 | }
|
| 116 |
|
| 117 | BigStr* StrFormat(const char* fmt, ...);
|
| 118 | BigStr* StrFormat(BigStr* fmt, ...);
|
| 119 |
|
| 120 | // NOTE: This iterates over bytes.
|
| 121 | class StrIter {
|
| 122 | public:
|
| 123 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
|
| 124 | // Cheney only: s_ could be moved during iteration.
|
| 125 | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
|
| 126 | }
|
| 127 | ~StrIter() {
|
| 128 | // gHeap.PopRoot();
|
| 129 | }
|
| 130 | void Next() {
|
| 131 | i_++;
|
| 132 | }
|
| 133 | bool Done() {
|
| 134 | return i_ >= len_;
|
| 135 | }
|
| 136 | BigStr* Value(); // similar to at()
|
| 137 |
|
| 138 | private:
|
| 139 | BigStr* s_;
|
| 140 | int i_;
|
| 141 | int len_;
|
| 142 |
|
| 143 | DISALLOW_COPY_AND_ASSIGN(StrIter)
|
| 144 | };
|
| 145 |
|
| 146 | extern BigStr* kEmptyString;
|
| 147 |
|
| 148 | // GlobalStr notes:
|
| 149 | // - sizeof("foo") == 4, for the NUL terminator.
|
| 150 | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
|
| 151 | // don't put it here because it triggers -Winvalid-offsetof
|
| 152 |
|
| 153 | template <int N>
|
| 154 | class GlobalStr {
|
| 155 | // A template type with the same layout as BigStr with length N-1 (which needs
|
| 156 | // a buffer of size N). For initializing global constant instances.
|
| 157 | public:
|
| 158 | int len_;
|
| 159 | unsigned hash_ : 31;
|
| 160 | unsigned is_hashed_ : 1;
|
| 161 | const char data_[N];
|
| 162 |
|
| 163 | DISALLOW_COPY_AND_ASSIGN(GlobalStr)
|
| 164 | };
|
| 165 |
|
| 166 | union Str {
|
| 167 | public:
|
| 168 | // Instead of this at the start of every function:
|
| 169 | // Str* s = nullptr;
|
| 170 | // It will now be:
|
| 171 | // Str s(nullptr);
|
| 172 | //
|
| 173 | // StackRoot _root(&s);
|
| 174 | explicit Str(BigStr* big) : big_(big) {
|
| 175 | }
|
| 176 |
|
| 177 | char* data() {
|
| 178 | return big_->data();
|
| 179 | }
|
| 180 |
|
| 181 | Str at(int i) {
|
| 182 | return Str(big_->at(i));
|
| 183 | }
|
| 184 |
|
| 185 | Str upper() {
|
| 186 | return Str(big_->upper());
|
| 187 | }
|
| 188 |
|
| 189 | uint64_t raw_bytes_;
|
| 190 | BigStr* big_;
|
| 191 | // TODO: add SmallStr, see mycpp/small_str_test.cc
|
| 192 | };
|
| 193 |
|
| 194 | inline int len(const Str s) {
|
| 195 | return len(s.big_);
|
| 196 | }
|
| 197 |
|
| 198 | // This macro is a workaround for the fact that it's impossible to have a
|
| 199 | // a constexpr initializer for char[N]. The "String Literals as Non-Type
|
| 200 | // Template Parameters" feature of C++ 20 would have done it, but it's not
|
| 201 | // there.
|
| 202 | //
|
| 203 | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
|
| 204 | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
|
| 205 | //
|
| 206 | // TODO: Can we hash values at compile time so they can be in the intern table?
|
| 207 |
|
| 208 | #define GLOBAL_STR(name, val) \
|
| 209 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
| 210 | ObjHeader::Global(TypeTag::BigStr), \
|
| 211 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
| 212 | BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
|
| 213 |
|
| 214 | // New style for SmallStr compatibility
|
| 215 | #define GLOBAL_STR2(name, val) \
|
| 216 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
| 217 | ObjHeader::Global(TypeTag::BigStr), \
|
| 218 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
| 219 | Str name(reinterpret_cast<BigStr*>(&_##name.obj));
|
| 220 |
|
| 221 | // Helper function that's consistent with JSON definition of ASCII whitespace,
|
| 222 | // e.g.
|
| 223 | // {"age": \t 42} is OK
|
| 224 | // {"age": \v 42} is NOT OK
|
| 225 | inline bool IsAsciiWhitespace(int ch) {
|
| 226 | return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
|
| 227 | }
|
| 228 |
|
| 229 | #endif // MYCPP_GC_STR_H
|