1 | #ifndef MYCPP_GC_STR_H
|
2 | #define MYCPP_GC_STR_H
|
3 |
|
4 | #include <limits.h> // CHAR_BIT
|
5 |
|
6 | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
|
7 | #include "mycpp/gc_obj.h" // GC_OBJ
|
8 | #include "mycpp/hash.h" // HashFunc
|
9 |
|
10 | // https://stackoverflow.com/questions/3919995/determining-sprintf-buffer-size-whats-the-standard/11092994#11092994
|
11 | // Notes:
|
12 | // - Python 2.7's intobject.c has an erroneous +6
|
13 | // - This is 13, but len('-2147483648') is 11, which means we only need 12?
|
14 | // - This formula is valid for octal(), because 2^(3 bits) = 8
|
15 |
|
16 | const int kIntBufSize = CHAR_BIT * sizeof(int) / 3 + 3;
|
17 |
|
18 | template <typename T>
|
19 | class List;
|
20 |
|
21 | class BigStr {
|
22 | public:
|
23 | // Don't call this directly. Call NewStr() instead, which calls this.
|
24 | BigStr() {
|
25 | }
|
26 |
|
27 | char* data() {
|
28 | return data_;
|
29 | }
|
30 |
|
31 | // Call this after writing into buffer created by OverAllocatedStr()
|
32 | void MaybeShrink(int str_len);
|
33 |
|
34 | BigStr* at(int i);
|
35 |
|
36 | int find(BigStr* needle, int start = 0, int end = -1);
|
37 | int rfind(BigStr* needle);
|
38 |
|
39 | BigStr* slice(int begin);
|
40 | BigStr* slice(int begin, int end);
|
41 |
|
42 | BigStr* strip();
|
43 | // Used for CommandSub in osh/cmd_exec.py
|
44 | BigStr* rstrip(BigStr* chars);
|
45 | BigStr* rstrip();
|
46 |
|
47 | BigStr* lstrip(BigStr* chars);
|
48 | BigStr* lstrip();
|
49 |
|
50 | BigStr* ljust(int width, BigStr* fillchar);
|
51 | BigStr* rjust(int width, BigStr* fillchar);
|
52 |
|
53 | // Can take (start, end) so Tokens can be compared without allocation
|
54 | bool startswith(BigStr* s);
|
55 | bool endswith(BigStr* s);
|
56 |
|
57 | BigStr* replace(BigStr* old, BigStr* new_str);
|
58 | BigStr* replace(BigStr* old, BigStr* new_str, int count);
|
59 | BigStr* join(List<BigStr*>* items);
|
60 |
|
61 | List<BigStr*>* split(BigStr* sep);
|
62 | List<BigStr*>* split(BigStr* sep, int max_split);
|
63 | List<BigStr*>* splitlines(bool keep);
|
64 |
|
65 | // TODO: Move unicode functions out of mycpp runtime? Because we won't match
|
66 | // Python exactly
|
67 | bool isdigit();
|
68 | bool isalpha();
|
69 | bool isupper();
|
70 |
|
71 | BigStr* upper();
|
72 | BigStr* lower();
|
73 |
|
74 | // Other options for fast comparison / hashing / string interning:
|
75 | // - unique_id_: an index into intern table. I don't think this works unless
|
76 | // you want to deal with rehashing all strings when the set grows.
|
77 | // - although note that the JVM has -XX:StringTableSize=FIXED, which means
|
78 | // - it can degrade into linked list performance
|
79 | // - Hashed strings become GLOBAL_STR(). Never deallocated.
|
80 | // - Hashed strings become part of the "large object space", which might be
|
81 | // managed by mark and sweep. This requires linked list overhead.
|
82 | // (doubly-linked?)
|
83 | // - Intern strings at GARBAGE COLLECTION TIME, with
|
84 | // LayoutForwarded::new_location_? Is this possible? Does it introduce
|
85 | // too much coupling between strings, hash tables, and GC?
|
86 |
|
87 | static constexpr ObjHeader obj_header() {
|
88 | return ObjHeader::BigStr();
|
89 | }
|
90 |
|
91 | unsigned hash(HashFunc h);
|
92 |
|
93 | int len_;
|
94 | unsigned hash_ : 31;
|
95 | unsigned is_hashed_ : 1;
|
96 | char data_[1]; // flexible array
|
97 |
|
98 | private:
|
99 | int _strip_left_pos();
|
100 | int _strip_right_pos();
|
101 |
|
102 | DISALLOW_COPY_AND_ASSIGN(BigStr)
|
103 | };
|
104 |
|
105 | constexpr int kStrHeaderSize = offsetof(BigStr, data_);
|
106 |
|
107 | // Note: for SmallStr, we might copy into the VALUE
|
108 | inline void BigStr::MaybeShrink(int str_len) {
|
109 | len_ = str_len;
|
110 | data_[len_] = '\0'; // NUL terminate
|
111 | }
|
112 |
|
113 | inline int len(const BigStr* s) {
|
114 | return s->len_;
|
115 | }
|
116 |
|
117 | BigStr* StrFormat(const char* fmt, ...);
|
118 | BigStr* StrFormat(BigStr* fmt, ...);
|
119 |
|
120 | // NOTE: This iterates over bytes.
|
121 | class StrIter {
|
122 | public:
|
123 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
|
124 | // Cheney only: s_ could be moved during iteration.
|
125 | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
|
126 | }
|
127 | ~StrIter() {
|
128 | // gHeap.PopRoot();
|
129 | }
|
130 | void Next() {
|
131 | i_++;
|
132 | }
|
133 | bool Done() {
|
134 | return i_ >= len_;
|
135 | }
|
136 | BigStr* Value(); // similar to at()
|
137 |
|
138 | private:
|
139 | BigStr* s_;
|
140 | int i_;
|
141 | int len_;
|
142 |
|
143 | DISALLOW_COPY_AND_ASSIGN(StrIter)
|
144 | };
|
145 |
|
146 | extern BigStr* kEmptyString;
|
147 |
|
148 | // GlobalStr notes:
|
149 | // - sizeof("foo") == 4, for the NUL terminator.
|
150 | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
|
151 | // don't put it here because it triggers -Winvalid-offsetof
|
152 |
|
153 | template <int N>
|
154 | class GlobalStr {
|
155 | // A template type with the same layout as BigStr with length N-1 (which needs
|
156 | // a buffer of size N). For initializing global constant instances.
|
157 | public:
|
158 | int len_;
|
159 | unsigned hash_ : 31;
|
160 | unsigned is_hashed_ : 1;
|
161 | const char data_[N];
|
162 |
|
163 | DISALLOW_COPY_AND_ASSIGN(GlobalStr)
|
164 | };
|
165 |
|
166 | union Str {
|
167 | public:
|
168 | // Instead of this at the start of every function:
|
169 | // Str* s = nullptr;
|
170 | // It will now be:
|
171 | // Str s(nullptr);
|
172 | //
|
173 | // StackRoot _root(&s);
|
174 | explicit Str(BigStr* big) : big_(big) {
|
175 | }
|
176 |
|
177 | char* data() {
|
178 | return big_->data();
|
179 | }
|
180 |
|
181 | Str at(int i) {
|
182 | return Str(big_->at(i));
|
183 | }
|
184 |
|
185 | Str upper() {
|
186 | return Str(big_->upper());
|
187 | }
|
188 |
|
189 | uint64_t raw_bytes_;
|
190 | BigStr* big_;
|
191 | // TODO: add SmallStr, see mycpp/small_str_test.cc
|
192 | };
|
193 |
|
194 | inline int len(const Str s) {
|
195 | return len(s.big_);
|
196 | }
|
197 |
|
198 | // This macro is a workaround for the fact that it's impossible to have a
|
199 | // a constexpr initializer for char[N]. The "String Literals as Non-Type
|
200 | // Template Parameters" feature of C++ 20 would have done it, but it's not
|
201 | // there.
|
202 | //
|
203 | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
|
204 | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
|
205 | //
|
206 | // TODO: Can we hash values at compile time so they can be in the intern table?
|
207 |
|
208 | #define GLOBAL_STR(name, val) \
|
209 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
210 | ObjHeader::Global(TypeTag::BigStr), \
|
211 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
212 | BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
|
213 |
|
214 | // New style for SmallStr compatibility
|
215 | #define GLOBAL_STR2(name, val) \
|
216 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
217 | ObjHeader::Global(TypeTag::BigStr), \
|
218 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
219 | Str name(reinterpret_cast<BigStr*>(&_##name.obj));
|
220 |
|
221 | // Helper function that's consistent with JSON definition of ASCII whitespace,
|
222 | // e.g.
|
223 | // {"age": \t 42} is OK
|
224 | // {"age": \v 42} is NOT OK
|
225 | inline bool IsAsciiWhitespace(int ch) {
|
226 | return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
|
227 | }
|
228 |
|
229 | #endif // MYCPP_GC_STR_H
|