dateparser.timelexer source code

1 /**
2  * Boost Software License - Version 1.0 - August 17th, 2003
3  *
4  * Permission is hereby granted, free of charge, to any person or organization
5  * obtaining a copy of the software and accompanying documentation covered by
6  * this license (the "Software") to use, reproduce, display, distribute,
7  * execute, and transmit the Software, and to prepare derivative works of the
8  * Software, and to permit third-parties to whom the Software is furnished to
9  * do so, all subject to the following:
10  *
11  * The copyright notices in the Software and this entire statement, including
12  * the above license grant, this restriction and the following disclaimer,
13  * must be included in all copies of the Software, in whole or in part, and
14  * all derivative works of the Software, unless such copies or derivative
15  * works are solely in the form of machine-executable object code generated by
16  * a source language processor.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24  * DEALINGS IN THE SOFTWARE.
25  */
26 
27 module dateparser.timelexer;
28 
29 debug(dateparser) import std.stdio;
30 import std.range;
31 import std.traits;
32 import std.regex;
33 import dateparser.splitter;
34 
/// Lexer states used while a single token is being accumulated.
private enum State
{
    /// No character of the current token has been read yet.
    EMPTY,
    /// The token started with a letter.
    ALPHA,
    /// The token started with a digit.
    NUMERIC,
    /// A '.' was read while in an alphabetic run.
    ALPHA_PERIOD,
    /// Not assigned anywhere in the visible lexer code.
    PERIOD,
    /// A '.' (or a ',' after at least two characters) was read while in a
    /// numeric run.
    NUMERIC_PERIOD,
}
44 
45 package:
46 
// Compile-time regex matching a single '.' or ',' (captured as group 1).
// The "g" (global) flag is set explicitly because the backwards compatible
// version of splitterWithMatches needs it.
enum split_decimal = ctRegex!(`([\.,])`, "g");
50 
/**
 * Breaks a time string into lexical units (tokens) that the parser can
 * consume. A token boundary is placed wherever the kind of character
 * changes, so a continuous run of letters forms one token and a continuous
 * run of digits forms one token; for example `"10:10"` is lexed as
 * `["10", ":", "10"]`.
 *
 * Dots are the main complication: a '.' may be a separator (as in
 * "Sep.20.2009") or a decimal point (as in "4:30:21.447"). The whole
 * dot-separated run therefore has to be read before it can be decided how
 * to break it up, and the lexer keeps a "token stack" for the cases where
 * several tokens are produced at once.
 *
 * Params:
 *     r = the range to parse; must be an input range whose element type is
 *         `char` (ignoring qualifiers)
 * Returns:
 *     an input range of strings
 */
auto timeLexer(Range)(Range r)
if (isInputRange!Range && is(Unqual!(ElementType!Range) == char))
{
    alias Lexer = TimeLexerResult!Range;
    return Lexer(r);
}
73 
// Issue 15831: This should be a Voldemort type, but due to linker slowdown
// it's a good idea to put this outside so we don't slow down people's build
// times
/**
 * Input range of `string` tokens lexed from a range of `char`.
 *
 * The range always holds one token that has already been lexed: the
 * constructor reads the first token and each `popFront` replaces it with
 * the next one. The range is empty once the current token is empty and no
 * input, pushed-back character or pending token remains.
 */
struct TimeLexerResult(Range)
{
private:
    /// Input characters that have not been consumed yet.
    Range source;
    /// Character that terminated the previous token; it is re-read as the
    /// first character of the next token before `source` is consulted.
    string charStack;
    /// Tokens produced by splitting a dotted run that are still waiting to
    /// be handed out, in order.
    string[] tokenStack;
    /// The current front token.
    string token;

public:
    /// Stores the input range and lexes the first token.
    this(Range r)
    {
        source = r;
        popFront();
    }

    /// Returns: the current token.
    auto front() @property
    {
        return token;
    }

    /**
     * Advances to the next token.
     *
     * Pending tokens on the token stack are handed out first. Otherwise
     * characters are read (pushed-back character first, then `source`) and
     * accumulated by a small state machine until a character that does not
     * belong to the current token is found; that character is pushed back
     * for the next call. If nothing is left to read, the token becomes
     * empty.
     */
    void popFront()
    {
        import std.utf : byCodeUnit;
        import std.algorithm.searching : canFind, count;
        import std.uni : isAlpha;
        import std.ascii : isDigit;

        // Tokens left over from an earlier split are emitted before any
        // more input is read.
        if (tokenStack.length > 0)
        {
            token = tokenStack[0];
            tokenStack = tokenStack[1 .. $];
            return;
        }

        bool seenLetters = false;
        State state = State.EMPTY;
        token = string.init;

        while (!source.empty || !charStack.empty)
        {
            // We only realize that we've reached the end of a token when we
            // find a character that's not part of the current token - since
            // that character may be part of the next token, it's stored in the
            // charStack.
            char nextChar;

            if (!charStack.empty)
            {
                nextChar = charStack[0];
                charStack = charStack[1 .. $];
            }
            else
            {
                nextChar = source.front;
                source.popFront;
            }

            if (state == State.EMPTY)
            {
                debug(dateparser) writeln("EMPTY");
                // First character of the token - determines if we're starting
                // to parse a word, a number or something else.
                token ~= nextChar;

                if (nextChar.isAlpha)
                    state = State.ALPHA;
                else if (nextChar.isDigit)
                    state = State.NUMERIC;
                else
                    break; // any other character (a space included) is a token on its own
                debug(dateparser) writeln("TOKEN ", token, " STATE ", state);
            }
            else if (state == State.ALPHA)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've already started reading a word, we keep reading
                // until we find one of the characters that end a word. Note
                // that this is an exclusion list: anything that is not one
                // of the characters below (or a digit) is appended.
                seenLetters = true;

                if (nextChar != '.' && nextChar != ',' &&
                    nextChar != '/' && nextChar != '-' &&
                    nextChar != '+' && nextChar != ' ' &&
                    !nextChar.isDigit)
                {
                    token ~= nextChar;
                }
                else if (nextChar == '.')
                {
                    token ~= nextChar;
                    state = State.ALPHA_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
            else if (state == State.NUMERIC)
            {
                // If we've already started reading a number, we keep reading
                // numbers until we find something that doesn't fit.
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                if (nextChar.isDigit)
                    token ~= nextChar;
                else if (nextChar == '.' || (nextChar == ',' && token.length >= 2))
                {
                    // A comma only continues the number when at least two
                    // characters have been read already.
                    token ~= nextChar;
                    state = State.NUMERIC_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    debug(dateparser) writeln("charStack add: ", charStack);
                    break; //emit token
                }
            }
            else if (state == State.ALPHA_PERIOD)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've seen some letters and a dot separator, continue
                // parsing, and the tokens will be broken up later.
                seenLetters = true;
                if (nextChar == '.' || nextChar.isAlpha)
                {
                    token ~= nextChar;
                }
                else if (nextChar.isDigit && token[$ - 1] == '.')
                {
                    // A digit directly after a dot switches to the numeric
                    // flavour of the dotted run.
                    token ~= nextChar;
                    state = State.NUMERIC_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
            else if (state == State.NUMERIC_PERIOD)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've seen at least one dot separator, keep going, we'll
                // break up the tokens later.
                if (nextChar == '.' || nextChar.isDigit)
                    token ~= nextChar;
                else if (nextChar.isAlpha && token[$ - 1] == '.')
                {
                    // A letter directly after a dot switches to the
                    // alphabetic flavour of the dotted run.
                    token ~= nextChar;
                    state = State.ALPHA_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
        }

        debug(dateparser) writeln("STATE ", state, " seenLetters: ", seenLetters);

        // A dotted run is broken into several tokens when it contains
        // letters, more than one dot, or ends in a separator. Otherwise it
        // is kept whole (e.g. "41.5"). The token is never empty here: both
        // period states are only entered after a character was appended.
        immutable inPeriodState = state == State.ALPHA_PERIOD
            || state == State.NUMERIC_PERIOD;

        if (inPeriodState && (seenLetters || token.byCodeUnit.count('.') > 1
                || token[$ - 1] == '.' || token[$ - 1] == ','))
        {
            // splitterWithMatches comes from dateparser.splitter; judging by
            // its name it yields the separators as well as the pieces
            // between them - confirm against that module.
            auto parts = splitterWithMatches(token[], split_decimal);
            token = parts.front;
            parts.popFront;

            // The first part becomes the current token; the non-empty rest
            // is queued for the following popFront calls.
            foreach (part; parts)
                if (part.length > 0)
                    tokenStack ~= part;
        }

        // A numeric token without any dot has its commas turned into dots.
        if (state == State.NUMERIC_PERIOD && !token.byCodeUnit.canFind('.'))
            token = token.replace(",", ".");
    }

    /**
     * Returns: `true` when the current token is empty and neither the
     * source, the pushed-back character nor the token stack has anything
     * left.
     *
     * The attributes are spelled out explicitly, so `Range.empty` has to be
     * callable from `@nogc @safe nothrow pure` code.
     */
    bool empty() @nogc @safe @property nothrow pure
    {
        return token.empty && source.empty && charStack.empty && tokenStack.empty;
    }
}
266 
// Lexing code-unit ranges: a space-separated date with a zone abbreviation
// and an ISO 8601 style timestamp with a fractional second and UTC offset.
unittest
{
    import std.algorithm.comparison : equal;
    import std.utf : byCodeUnit;

    // Every space and colon is a token of its own.
    auto spacedTokens = "Thu Sep 25 10:36:28 BRST 2003".byCodeUnit.timeLexer;
    auto spacedExpected = [
        "Thu", " ", "Sep", " ", "25", " ",
        "10", ":", "36", ":", "28", " ",
        "BRST", " ", "2003"
    ];
    assert(equal(spacedTokens, spacedExpected));

    // "41.5" has a single dot and no letters, so it stays one token.
    auto isoTokens = "2003-09-25T10:49:41.5-03:00".byCodeUnit.timeLexer;
    auto isoExpected = [
        "2003", "-", "09", "-", "25", "T",
        "10", ":", "49", ":", "41.5", "-",
        "03", ":", "00"
    ];
    assert(equal(isoTokens, isoExpected));
}
284 
// The lexer also accepts ranges produced by std.utf.byChar.
unittest
{
    import std.algorithm.comparison : equal;
    import std.utf : byChar;

    // A bare time: the colon separates two numeric tokens.
    auto timeTokens = "10:10".byChar.timeLexer;
    assert(equal(timeTokens, ["10", ":", "10"]));

    // Words followed by a time.
    auto mixedTokens = "Thu Sep 10:36:28".byChar.timeLexer;
    auto mixedExpected = ["Thu", " ", "Sep", " ", "10", ":", "36", ":", "28"];
    assert(equal(mixedTokens, mixedExpected));
}