1 /**
2  * Provides a utility class that preserves the minimal memory requirements of a string and combines
3  * it with the ability to access characters by grapheme, e.g. a visible character.
4  */
5 module gstring;
6 
7 import std.traits : isSomeChar, isSomeString;
8 import std.uni : Grapheme, byGrapheme;
9 import std.range : isInputRange, ElementType;
10 
11 public import std.string : CaseSensitive;
12 
13 @safe:
14 
15 /// An alias for GString using `char` (UTF-8).
16 alias CGString = GString!char;
17 
18 /// An alias for GString using `wchar` (UTF-16).
19 alias WGString = GString!wchar;
20 
21 /// An alias for GString using `dchar` (UTF-32).
22 alias DGString = GString!dchar;
23 
24 /**
25  * A `GString`, or grapheme-string, is a unicode-aware string supporting simple array-index
26  * operations that operate on graphemes (displayable characters).
27  *
28  * For example, consider the string "R̆um". At what array index is the character 'u'? Because this is
29  * a UTF-8 encoded string, it is actually at index 3, not 1. "R̆" is composed of 2 unicode
30  * code-points, "R" and " ̆ " (combining breve). The "R" code-point is encoded in a single byte, but
31  * the combining breve requires two bytes. These two code-points are combined into a single visible
32  * character (grapheme), "R̆".
33  *
34  * This type permits simple array-index operations based on graphemes, that is, the visible position
35  * of characters. It uses a string as its undrelying storage without an additional memory footprint.
36  * However, array-like operations tend to have `O(n)` complexity, because the type must compute the
37  * offset of each grapheme. Thus, `GString` is best used with smaller strings, especially those that
38  * need to perform operations by character position, such as in user interfaces.
39  *
40  * If memory is not an issue, and array-like operations by Grapheme are needed for very large
41  * strings, it is better to use `dchar[]` or `Grapheme[]`.
42  *
43  * Params:
44  * CharT = Controls the type of encoding to use. `char` => UTF-8, `wchar` => UTF-16, and
45  *         `dchar` => UTF-32.
46  */
47 struct GString(CharT)
48 if (isSomeChar!CharT) {
49   alias StringT = immutable(CharT)[];
50 
51   /// The underlying stored string using UTF encoding according to CharT.
52   StringT rawString;
53 
54   this(StringT str) {
55     this.rawString = str;
56   }
57 
58   this(R)(R str)
59   if (isSomeString!R) {
60     this(toStringT(str));
61   }
62 
63   /**
64    * A GString can be used with libraries that operate on normal strings, however, they are treated
65    * like normal strings, and these libraries should be used with caution.
66    */
67   alias rawString this;
68 
69   ///
70   unittest {
71     import std.string : capitalize;
72     auto t1 = typeof(this)("test");
73     assert(capitalize(t1) == "Test");
74   }
75 
76   private static StringT toStringT(R)(R str)
77   if (isSomeString!R) {
78     static if (is(CharT == char)) {
79       import std.utf : toUTF8;
80       return str.toUTF8();
81     } else static if (is(CharT == wchar)) {
82       import std.utf : toUTF16;
83       return str.toUTF16();
84     } else static if (is(CharT == dchar)) {
85       import std.utf : toUTF32;
86       return str.toUTF32();
87     }
88   }
89 
90   private static StringT toStringT(C)(C ch)
91   if (isSomeChar!C) {
92     static if (is(CharT == char)) {
93       import std.conv : text;
94       return toStringT(text(ch));
95     } else static if (is(CharT == wchar)) {
96       import std.conv : wtext;
97       return toStringT(wtext(ch));
98     } else static if (is(CharT == dchar)) {
99       import std.conv : dtext;
100       return toStringT(dtext(ch));
101     }
102   }
103 
104   string toString() const {
105     import std.utf : toUTF8;
106     return toUTF8(rawString);
107   }
108 
109   wstring toWString() const {
110     import std.utf : toUTF16;
111     return toUTF16(rawString);
112   }
113 
114   dstring toDString() const {
115     import std.utf : toUTF32;
116     return toUTF32(rawString);
117   }
118 
119   /**
120    * Returns the i'th displayable character (Grapheme) within the `GString`.
121    *
122    * Complexity: `O(i)`. Each Grapheme is composed of a variable number of code-points, and
123    * code-poinst are composed of a variable number of characters (code-units).
124    */
125   Grapheme opIndex(size_t i) {
126     import std.range : drop;
127     return rawString.byGrapheme.drop(i).front;
128   }
129 
130   ///
131   unittest {
132     auto t1 = typeof(this)("Test R̆ȧm͆b̪õ.");
133     assert(t1[3].length == 1);
134     assert(t1[3][0] == 't');
135     assert(t1[5].length == 2);
136     assert(t1[5][0] == 'R' && t1[5][1] == '\u0306');
137     assert(t1[6].length == 2);
138     assert(t1[6][0] == 'a' && t1[6][1] == '\u0307');
139   }
140 
141   /**
142    * Returns a range of Graphemes representing the original string.
143    *
144    * To retrieve a string, use `toString()`, `toWString()`, or `toDString()`.
145    *
146    * Complexity: `O(length)`
147    */
148   Grapheme[] opIndex() {
149     import std.array : array;
150     return rawString.byGrapheme.array;
151   }
152 
153   /**
154    * Creates a data structure to represent the 0'th index during array slicing,
155    * e.g. `gstring[i..j]`.
156    */
157   size_t[2] opSlice(size_t dim = 0)(size_t i, size_t j)
158   {
159     return [i, j];
160   }
161 
162   Grapheme[] opIndex()(size_t[2] slice) {
163     import std.range : drop, take, array;
164     return rawString.byGrapheme.drop(slice[0]).take(slice[1]-slice[0]).array;
165   }
166 
167   unittest {
168     auto t1 = typeof(this)("Test R̆ȧm͆b̪õ");
169     assert(t1[2..7] == typeof(this)("st R̆ȧ")[]);
170   }
171 
172   /// Replaces a single grapheme in the GString.
173   void opIndexAssign(R)(R str, size_t i)
174   if (isSomeString!R) {
175     opIndexAssign(Grapheme(str), i);
176   }
177 
178   /// ditto
179   void opIndexAssign(C)(C ch, size_t i)
180   if (isSomeChar!C) {
181     opIndexAssign(Grapheme(ch), i);
182   }
183 
184   /// ditto
185   void opIndexAssign(Grapheme g, size_t i) {
186     import std.uni : graphemeStride;
187     import std.array : array;
188 
189     ptrdiff_t charIdx = 0;
190     for (int idx = 0; idx < i; idx++, charIdx += rawString.graphemeStride(charIdx)) {}
191 
192     ptrdiff_t nextCharIdx = charIdx + rawString.graphemeStride(charIdx);
193     rawString = rawString[0..charIdx] ~ toStringT(g[].array) ~ rawString[nextCharIdx..$];
194   }
195 
196   ///
197   unittest {
198     auto t1 = typeof(this)("Test R̆ȧm͆b̪õ");
199     t1[6] = Grapheme("a");
200     assert(t1.rawString == "Test R̆am͆b̪õ");  // Reduces string size.
201     t1[6] = Grapheme("ȧ");
202     assert(t1.rawString == "Test R̆ȧm͆b̪õ");  // Increases string size.
203     t1[7] = "m";
204     assert(t1.rawString == "Test R̆ȧmb̪õ");
205     t1[8] = 'b';
206     assert(t1.rawString == "Test R̆ȧmbõ");
207   }
208 
209   /// Enables the use of `foreach (Grapheme; ...)` statements.
210   int opApply(scope int delegate (Grapheme) @safe dg) {
211     foreach (grapheme; rawString.byGrapheme) {
212       int result = dg(grapheme);
213       if (result)
214         return result;
215     }
216     return 0;
217   }
218 
219   ///
220   unittest {
221     auto t1 = typeof(this)("Test R̆ȧm͆b̪õ");
222     StringT nakedString;
223     StringT nakedExpected = "Test Rambo";
224     foreach (g; t1) {
225       nakedString ~= g[0];
226     }
227     assert(nakedString == nakedExpected);
228   }
229 
230   /// Enables the use of `foreach (size_t, Grapheme; ...)` statements.
231   int opApply(scope int delegate (size_t index, Grapheme) @safe dg) {
232     size_t index = 0;
233     foreach (grapheme; rawString.byGrapheme) {
234       int result = dg(index, grapheme);
235       if (result)
236         return result;
237       index++;
238     }
239     return 0;
240   }
241 
242   ///
243   unittest {
244     auto t1 = typeof(this)("Test R̆ȧm͆b̪õ");
245     StringT nakedString;
246     StringT nakedExpected = "Test Rambo";
247     size_t isum = 0;
248     foreach (i, g; t1) {
249       isum += i;
250       nakedString ~= g[0];
251     }
252     assert(isum == 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9);
253     assert(nakedString == nakedExpected);
254   }
255 
256   /// A binary operator to allow appending with a Grapheme.
257   typeof(this) opBinary(string op : "~")(Grapheme g) {
258     import std.array : array;
259     return typeof(this)(rawString ~ toStringT(g[].array));
260   }
261 
262   /// ditto
263   typeof(this) opBinaryRight(string op : "~")(Grapheme g) {
264     import std.array : array;
265     return typeof(this)(toStringT(g[].array) ~ rawString);
266   }
267 
268   ///
269   unittest {
270     auto t1 = typeof(this)("Test ");
271     auto t2 = t1 ~ Grapheme("A");
272     assert(t2.rawString == "Test A");
273     auto t3 = Grapheme("B") ~ t1;
274     assert(t3.rawString == "BTest ");
275   }
276 
277   /// A binary operator to allow appending with arbitrary strings.
278   typeof(this) opBinary(string op : "~", R)(R str)
279   if (isSomeString!R) {
280     return typeof(this)(rawString ~ toStringT(str));
281   }
282 
283   /// ditto
284   typeof(this) opBinaryRight(string op : "~", R)(R str)
285   if (isSomeString!R) {
286     return typeof(this)(toStringT(str) ~ rawString);
287   }
288 
289   ///
290   unittest {
291     auto t1 = typeof(this)("Test ");
292     auto t2 = t1 ~ "A";
293     assert(t2.rawString == "Test A");
294     auto t3 = t1 ~ "B"w;
295     assert(t3.rawString == "Test B");
296     auto t4 = t1 ~ "C"d;
297     assert(t4.rawString == "Test C");
298     auto t5 = "D"w ~ t1;
299     assert(t5.rawString == "DTest ");
300   }
301 
302   /// A binary operator supporting appending a single character.
303   typeof(this) opBinary(string op : "~", C)(C ch)
304   if (isSomeChar!C) {
305     import std.conv : to;
306     return typeof(this)(rawString ~ to!CharT(ch));
307   }
308 
309   /// ditto
310   typeof(this) opBinaryRight(string op : "~", C)(C ch)
311   if (isSomeChar!C) {
312     import std.conv : to;
313     return typeof(this)(to!CharT(ch) ~ rawString);
314   }
315 
316   ///
317   unittest {
318     auto t1 = typeof(this)("Test ");
319     auto t2 = t1 ~ 'A';
320     assert(t2.rawString == "Test A");
321     auto t3 = t1 ~ cast(wchar) 'B';
322     assert(t3.rawString == "Test B");
323     auto t4 = t1 ~ cast(dchar) 'C';
324     assert(t4.rawString == "Test C");
325     auto t5 = cast(dchar) 'D' ~ t1;
326     assert(t5.rawString == "DTest ");
327   }
328 
329   /// Allows appending to an existing GString using the `~=` operator.
330   void opOpAssign(string op : "~", R)(R str)
331   if (isSomeString!R) {
332     this.rawString ~= toStringT(str);
333   }
334 
335   /// ditto
336   void opOpAssign(string op : "~", C)(C ch)
337   if (isSomeChar!C) {
338     this.rawString ~= toStringT(ch);
339   }
340 
341   /// ditto
342   void opOpAssign(string op : "~")(Grapheme g) {
343     import std.array : array;
344     this.rawString ~= toStringT(g[].array);
345   }
346 
347   ///
348   unittest {
349     auto t1 = typeof(this)("Test");
350     t1 ~= "R᪶";
351     assert(t1.rawString == "TestR᪶");
352     t1 ~= 'ö';
353     assert(t1.rawString == "TestR᪶ö");
354     t1 ~= Grapheme("b");
355     assert(t1.rawString == "TestR᪶öb");
356   }
357 
358   /// Finds the grapheme-index of the given search string.
359   ptrdiff_t indexOf(R)(R str)
360   if (isSomeString!R) {
361     import std.algorithm : startsWith;
362     import std.uni : graphemeStride;
363 
364     StringT target = toStringT(str);
365     ptrdiff_t index = 0;
366     for (ptrdiff_t i = 0; index < rawString.length; index += rawString.graphemeStride(index), i++) {
367       if (rawString[index..$].startsWith(target)) {
368         return i;
369       }
370     }
371     return -1;
372   }
373 
374   ///
375   unittest {
376     auto t1 = typeof(this)("Test R̆ȧm͆b̪õ R̆ȧm͆b̪õ");
377     assert(t1.indexOf("ȧ") == 6);
378     assert(t1.indexOf("m̪") == -1);
379   }
380 
381 }
382 
383 ///
384 unittest {
385   auto t1 = CGString("Test R̆ȧm͆b̪õ");
386   assert(t1.indexOf("ȧ") == 6);
387   assert(t1.indexOf("m̪") == -1);
388   t1[9] = "í";
389   assert(t1.toString() == "Test R̆ȧm͆b̪í");
390 }