dateparser.timelexer source code

1 /**
2  * Boost Software License - Version 1.0 - August 17th, 2003
3  *
4  * Permission is hereby granted, free of charge, to any person or organization
5  * obtaining a copy of the software and accompanying documentation covered by
6  * this license (the "Software") to use, reproduce, display, distribute,
7  * execute, and transmit the Software, and to prepare derivative works of the
8  * Software, and to permit third-parties to whom the Software is furnished to
9  * do so, all subject to the following:
10  *
11  * The copyright notices in the Software and this entire statement, including
12  * the above license grant, this restriction and the following disclaimer,
13  * must be included in all copies of the Software, in whole or in part, and
14  * all derivative works of the Software, unless such copies or derivative
15  * works are solely in the form of machine-executable object code generated by
16  * a source language processor.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24  * DEALINGS IN THE SOFTWARE.
25  */
26 
27 module dateparser.timelexer;
28 
29 debug(dateparser) import std.stdio;
30 import std.range;
31 import std.traits;
32 import std.regex;
33 import dateparser.splitter;
34 
35 package:
36 
// Compile-time regex matching a single decimal separator: either '.' or ','.
// The separator sits in a capture group; TimeLexerResult.popFront passes this
// pattern to splitterWithMatches to break dotted tokens into pieces.
//
// Needs to be explicitly flagged global for the backwards compatible
// version of splitterWithMatches
enum split_decimal = ctRegex!(`([\.,])`, "g");
40 
/**
* Breaks a time string into lexical units (tokens) for the parser.
*
* A token boundary occurs wherever the character class changes: a run of
* letters forms one token and a run of digits forms one token.
*
* Dots ('.') are ambiguous because they may act as separators (e.g.
* "Sep.20.2009") or as decimal points (e.g. "4:30:21.447"). The lexer
* therefore reads an entire dot-separated string before deciding how to
* split it, and keeps a "token stack" for the cases where one read
* produces several tokens at once.
*
* Params:
*     r = the range to parse
* Returns:
*     an input range of strings
*/
auto timeLexer(Range)(Range r) if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range))
{
    alias Lexer = TimeLexerResult!Range;
    return Lexer(r);
}
63 
// Issue 15831: This should be a Voldemort type, but due to linker slowdown
// it's a good idea to put this outside so we don't slowdown people's build
// times
/**
* Input range of `string` tokens produced by `timeLexer`.
*
* The constructor immediately lexes the first token, so `front` is valid as
* soon as the range is constructed (unless the source was empty, in which case
* the range is empty). Each `popFront` either hands out a token that was
* already split off and queued in `tokenStack`, or reads further characters
* from `source` to build the next token.
*
* Characters are kept in the element type the source yields, so code points
* outside ASCII coming from a decoded (`dchar`) source are UTF-8 encoded into
* the token instead of being truncated to a single byte.
*/
struct TimeLexerResult(Range)
{
private:
    // Type yielded by `source.front`, with qualifiers stripped.
    alias Elem = Unqual!(ElementType!Range);

    Range source;
    // Push-back buffer for the character that terminated the previous token.
    // Read by index so that `char` elements are never auto-decoded.
    Elem[] charStack;
    // Tokens already split off a dotted string, waiting to be handed out.
    string[] tokenStack;
    // The token currently exposed through `front`.
    string token;
    enum State
    {
        EMPTY,
        ALPHA,
        NUMERIC,
        ALPHA_PERIOD,
        PERIOD,
        NUMERIC_PERIOD
    }

    // Appends one source character to the current token. `char` elements are
    // appended as-is; any other character type is encoded to UTF-8.
    void appendToToken(Elem c)
    {
        static if (is(Elem == char))
            token ~= c;
        else
            token ~= cast(dchar) c;
    }

public:
    /// Wraps `r` and lexes the first token.
    this(Range r)
    {
        source = r;
        popFront();
    }

    /// The current token.
    auto front() @property
    {
        return token;
    }

    /// Advances to the next token.
    void popFront()
    {
        import std.algorithm.searching : canFind, count;
        import std.uni : isAlpha;

        // Tokens queued by an earlier split are emitted before reading on.
        if (tokenStack.length > 0)
        {
            immutable f = tokenStack.front;
            tokenStack.popFront;
            token = f;
            return;
        }

        bool seenLetters = false;
        State state = State.EMPTY;
        token = string.init;

        while (!source.empty || charStack.length > 0)
        {
            // We only realize that we've reached the end of a token when we
            // find a character that's not part of the current token - since
            // that character may be part of the next token, it's stored in the
            // charStack.
            Elem nextChar;

            if (charStack.length > 0)
            {
                nextChar = charStack[0];
                charStack = charStack[1 .. $];
            }
            else
            {
                nextChar = source.front;
                source.popFront;
            }

            if (state == State.EMPTY)
            {
                debug(dateparser) writeln("EMPTY");
                // First character of the token - determines if we're starting
                // to parse a word, a number or something else.
                appendToToken(nextChar);

                if (nextChar.isAlpha)
                    state = State.ALPHA;
                else if (nextChar.isNumber)
                    state = State.NUMERIC;
                else
                    break; // a space or any other symbol is a token by itself
                debug(dateparser) writeln("TOKEN ", token, " STATE ", state);
            }
            else if (state == State.ALPHA)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", cast(uint) nextChar);
                // If we've already started reading a word, we keep reading
                // letters until we find something that's not part of a word.
                seenLetters = true;

                if (nextChar != '.' && nextChar != ',' &&
                    nextChar != '/' && nextChar != '-' &&
                    nextChar != ' ' && !nextChar.isNumber)
                {
                    appendToToken(nextChar);
                }
                else if (nextChar == '.')
                {
                    appendToToken(nextChar);
                    state = State.ALPHA_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
            else if (state == State.NUMERIC)
            {
                // If we've already started reading a number, we keep reading
                // numbers until we find something that doesn't fit.
                debug(dateparser) writeln("STATE ", state, " nextChar: ", cast(uint) nextChar);
                if (nextChar.isNumber)
                    appendToToken(nextChar);
                else if (nextChar == '.' || (nextChar == ',' && token.length >= 2))
                {
                    appendToToken(nextChar);
                    state = State.NUMERIC_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    debug(dateparser) writeln("charStack add: ", charStack);
                    break; //emit token
                }
            }
            else if (state == State.ALPHA_PERIOD)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", cast(uint) nextChar);
                // If we've seen some letters and a dot separator, continue
                // parsing, and the tokens will be broken up later.
                seenLetters = true;
                if (nextChar == '.' || nextChar.isAlpha)
                    appendToToken(nextChar);
                else if (nextChar.isNumber && token[$ - 1] == '.')
                {
                    appendToToken(nextChar);
                    state = State.NUMERIC_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
            else if (state == State.NUMERIC_PERIOD)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", cast(uint) nextChar);
                // If we've seen at least one dot separator, keep going, we'll
                // break up the tokens later.
                if (nextChar == '.' || nextChar.isNumber)
                    appendToToken(nextChar);
                else if (nextChar.isAlpha && token[$ - 1] == '.')
                {
                    appendToToken(nextChar);
                    state = State.ALPHA_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
        }

        debug(dateparser) writeln("STATE ", state, " seenLetters: ", seenLetters);

        // A dotted token is split at its separators when it contained letters,
        // more than one dot, or ends in a separator. Otherwise it is kept
        // whole (e.g. "41.5"). The token is never empty in these two states,
        // because they are only entered right after appending a character.
        immutable dotted = state == State.ALPHA_PERIOD || state == State.NUMERIC_PERIOD;
        if (dotted && (seenLetters || token.count('.') > 1
                || token[$ - 1] == '.' || token[$ - 1] == ','))
        {
            auto l = splitterWithMatches(token[], split_decimal);
            token = l.front;
            l.popFront;

            foreach (tok; l)
                if (tok.length > 0)
                    tokenStack ~= tok;
        }

        // A number written with a comma and no dot gets the comma replaced by
        // a dot.
        if (state == State.NUMERIC_PERIOD && !token.canFind('.'))
            token = token.replace(",", ".");
    }

    /// True once no token is current and nothing is left to read or emit.
    bool empty()() @property
    {
        return token.empty && source.empty && charStack.length == 0 && tokenStack.empty;
    }
}
260 
unittest
{
    import std.algorithm.comparison : equal;

    // Words, spaces and a colon-separated clock time.
    string[] wordyExpected = [
        "Thu", " ", "Sep", " ", "25", " ",
        "10", ":", "36", ":", "28", " ",
        "BRST", " ", "2003"
    ];
    auto wordyTokens = timeLexer("Thu Sep 25 10:36:28 BRST 2003");
    assert(equal(wordyTokens, wordyExpected));

    // ISO-like timestamp: the fractional seconds "41.5" stay one token.
    string[] isoExpected = [
        "2003", "-", "09", "-", "25", "T",
        "10", ":", "49", ":", "41.5", "-",
        "03", ":", "00"
    ];
    auto isoTokens = timeLexer("2003-09-25T10:49:41.5-03:00");
    assert(equal(isoTokens, isoExpected));
}
277 
/++
    Tests whether a character is an ASCII decimal digit.

    Params: c = The character to test.
    Returns: `true` when `c` lies between '0' and '9' inclusive.
+/
pragma(inline, true)
bool isNumber(dchar c) @safe pure nothrow @nogc
{
    if (c < '0')
        return false;

    return c <= '9';
}
287 
unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // The lexer must work on a reference-type input range of dchar.
    string[] clockExpected = ["10", ":", "10"];
    auto clockSource = new ReferenceInputRange!dchar("10:10");
    assert(equal(timeLexer(clockSource), clockExpected));

    string[] mixedExpected = ["Thu", " ", "Sep", " ", "10", ":", "36", ":", "28"];
    auto mixedSource = new ReferenceInputRange!dchar("Thu Sep 10:36:28");
    assert(equal(timeLexer(mixedSource), mixedExpected));
}