OILS / mycpp / gc_str.h View on Github | oils.pub

229 lines, 112 significant
1#ifndef MYCPP_GC_STR_H
2#define MYCPP_GC_STR_H
3
4#include <limits.h> // CHAR_BIT
5
6#include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
7#include "mycpp/gc_obj.h" // GC_OBJ
8#include "mycpp/hash.h" // HashFunc
9
10// https://stackoverflow.com/questions/3919995/determining-sprintf-buffer-size-whats-the-standard/11092994#11092994
11// Notes:
12// - Python 2.7's intobject.c has an erroneous +6
13// - This is 13, but len('-2147483648') is 11, which means we only need 12?
14// - This formula is valid for octal(), because 2^(3 bits) = 8
15
16const int kIntBufSize = CHAR_BIT * sizeof(int) / 3 + 3;
17
18template <typename T>
19class List;
20
21class BigStr {
22 public:
23 // Don't call this directly. Call NewStr() instead, which calls this.
24 BigStr() {
25 }
26
27 char* data() {
28 return data_;
29 }
30
31 // Call this after writing into buffer created by OverAllocatedStr()
32 void MaybeShrink(int str_len);
33
34 BigStr* at(int i);
35
36 int find(BigStr* needle, int start = 0, int end = -1);
37 int rfind(BigStr* needle);
38
39 BigStr* slice(int begin);
40 BigStr* slice(int begin, int end);
41
42 BigStr* strip();
43 // Used for CommandSub in osh/cmd_exec.py
44 BigStr* rstrip(BigStr* chars);
45 BigStr* rstrip();
46
47 BigStr* lstrip(BigStr* chars);
48 BigStr* lstrip();
49
50 BigStr* ljust(int width, BigStr* fillchar);
51 BigStr* rjust(int width, BigStr* fillchar);
52
53 // Can take (start, end) so Tokens can be compared without allocation
54 bool startswith(BigStr* s);
55 bool endswith(BigStr* s);
56
57 BigStr* replace(BigStr* old, BigStr* new_str);
58 BigStr* replace(BigStr* old, BigStr* new_str, int count);
59 BigStr* join(List<BigStr*>* items);
60
61 List<BigStr*>* split(BigStr* sep);
62 List<BigStr*>* split(BigStr* sep, int max_split);
63 List<BigStr*>* splitlines(bool keep);
64
65 // TODO: Move unicode functions out of mycpp runtime? Because we won't match
66 // Python exactly
67 bool isdigit();
68 bool isalpha();
69 bool isupper();
70
71 BigStr* upper();
72 BigStr* lower();
73
74 // Other options for fast comparison / hashing / string interning:
75 // - unique_id_: an index into intern table. I don't think this works unless
76 // you want to deal with rehashing all strings when the set grows.
77 // - although note that the JVM has -XX:StringTableSize=FIXED, which means
78 // - it can degrade into linked list performance
79 // - Hashed strings become GLOBAL_STR(). Never deallocated.
80 // - Hashed strings become part of the "large object space", which might be
81 // managed by mark and sweep. This requires linked list overhead.
82 // (doubly-linked?)
83 // - Intern strings at GARBAGE COLLECTION TIME, with
84 // LayoutForwarded::new_location_? Is this possible? Does it introduce
85 // too much coupling between strings, hash tables, and GC?
86
87 static constexpr ObjHeader obj_header() {
88 return ObjHeader::BigStr();
89 }
90
91 unsigned hash(HashFunc h);
92
93 int len_;
94 unsigned hash_ : 31;
95 unsigned is_hashed_ : 1;
96 char data_[1]; // flexible array
97
98 private:
99 int _strip_left_pos();
100 int _strip_right_pos();
101
102 DISALLOW_COPY_AND_ASSIGN(BigStr)
103};
104
105constexpr int kStrHeaderSize = offsetof(BigStr, data_);
106
107// Note: for SmallStr, we might copy into the VALUE
108inline void BigStr::MaybeShrink(int str_len) {
109 len_ = str_len;
110 data_[len_] = '\0'; // NUL terminate
111}
112
113inline int len(const BigStr* s) {
114 return s->len_;
115}
116
117BigStr* StrFormat(const char* fmt, ...);
118BigStr* StrFormat(BigStr* fmt, ...);
119
120// NOTE: This iterates over bytes.
121class StrIter {
122 public:
123 explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
124 // Cheney only: s_ could be moved during iteration.
125 // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
126 }
127 ~StrIter() {
128 // gHeap.PopRoot();
129 }
130 void Next() {
131 i_++;
132 }
133 bool Done() {
134 return i_ >= len_;
135 }
136 BigStr* Value(); // similar to at()
137
138 private:
139 BigStr* s_;
140 int i_;
141 int len_;
142
143 DISALLOW_COPY_AND_ASSIGN(StrIter)
144};
145
146extern BigStr* kEmptyString;
147
148// GlobalStr notes:
149// - sizeof("foo") == 4, for the NUL terminator.
150// - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
151// don't put it here because it triggers -Winvalid-offsetof
152
153template <int N>
154class GlobalStr {
155 // A template type with the same layout as BigStr with length N-1 (which needs
156 // a buffer of size N). For initializing global constant instances.
157 public:
158 int len_;
159 unsigned hash_ : 31;
160 unsigned is_hashed_ : 1;
161 const char data_[N];
162
163 DISALLOW_COPY_AND_ASSIGN(GlobalStr)
164};
165
166union Str {
167 public:
168 // Instead of this at the start of every function:
169 // Str* s = nullptr;
170 // It will now be:
171 // Str s(nullptr);
172 //
173 // StackRoot _root(&s);
174 explicit Str(BigStr* big) : big_(big) {
175 }
176
177 char* data() {
178 return big_->data();
179 }
180
181 Str at(int i) {
182 return Str(big_->at(i));
183 }
184
185 Str upper() {
186 return Str(big_->upper());
187 }
188
189 uint64_t raw_bytes_;
190 BigStr* big_;
191 // TODO: add SmallStr, see mycpp/small_str_test.cc
192};
193
194inline int len(const Str s) {
195 return len(s.big_);
196}
197
198// This macro is a workaround for the fact that it's impossible to have a
199// a constexpr initializer for char[N]. The "String Literals as Non-Type
200// Template Parameters" feature of C++ 20 would have done it, but it's not
201// there.
202//
203// https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
204// https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
205//
206// TODO: Can we hash values at compile time so they can be in the intern table?
207
208#define GLOBAL_STR(name, val) \
209 GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
210 ObjHeader::Global(TypeTag::BigStr), \
211 {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
212 BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
213
214// New style for SmallStr compatibility
215#define GLOBAL_STR2(name, val) \
216 GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
217 ObjHeader::Global(TypeTag::BigStr), \
218 {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
219 Str name(reinterpret_cast<BigStr*>(&_##name.obj));
220
221// Helper function that's consistent with JSON definition of ASCII whitespace,
222// e.g.
223// {"age": \t 42} is OK
224// {"age": \v 42} is NOT OK
225inline bool IsAsciiWhitespace(int ch) {
226 return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
227}
228
229#endif // MYCPP_GC_STR_H