/**
 * Boost Software License - Version 1.0 - August 17th, 2003
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 *
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

module dateparser.timelexer;

debug(dateparser) import std.stdio;
import std.range;
import std.traits;
import std.regex;
import dateparser.splitter;

package:

// Needs to be explicitly flagged global for the backwards compatible
// version of splitterWithMatches
enum split_decimal = ctRegex!(`([\.,])`, "g");

/**
 * This function breaks the time string into lexical units (tokens), which
 * can be parsed by the parser. Lexical units are demarcated by changes in
 * the character set, so any continuous string of letters is considered
 * one unit, any continuous string of numbers is considered one unit.
 *
 * The main complication arises from the fact that dots ('.') can be used
 * both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
 * "4:30:21.447"). As such, it is necessary to read the full context of
 * any dot-separated strings before breaking it into tokens; as such, this
 * function maintains a "token stack", for when the ambiguous context
 * demands that multiple tokens be parsed at once.
 *
 * A comma that follows at least two digits is treated the same way as a
 * dot; when the resulting numeric token contains no dot, its commas are
 * rewritten to dots before the token is emitted.
 *
 * Params:
 *     r = the range to parse
 * Returns:
 *     an input range of strings
 */
auto timeLexer(Range)(Range r) if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range))
{
    return TimeLexerResult!Range(r);
}

// Issue 15831: This should be a Voldemort type, but due to linker slowdown
// it's a good idea to put this outside so we don't slowdown people's build
// times
/**
 * Input range of `string` tokens produced by `timeLexer`.
 *
 * The first token is computed in the constructor; every call to `popFront`
 * computes the next one. Tokens that had to be read together because of an
 * ambiguous separator are split after the fact and queued in `tokenStack`.
 */
struct TimeLexerResult(Range)
{
private:
    Range source;        // remaining, not yet consumed input
    string charStack;    // look-ahead characters that belong to the next token
    string[] tokenStack; // already split tokens waiting to be handed out
    string token;        // current front of the range
    enum State
    {
        EMPTY,
        ALPHA,
        NUMERIC,
        ALPHA_PERIOD,
        PERIOD,
        NUMERIC_PERIOD
    }

    /*
     * Appends one source element to `buf` without losing information.
     *
     * Sources whose `front` is a `char` deliver UTF-8 code units, which are
     * copied through unchanged. Every other source delivers `wchar`/`dchar`
     * values (this includes narrow strings, whose `front` is a decoded
     * `dchar`); those are appended as `dchar` so that they get UTF-8 encoded
     * instead of being truncated to a single byte. Each element is handled
     * on its own, so UTF-16 surrogate halves are not recombined.
     */
    static void appendElement(ref string buf, uint c)
    {
        static if (is(Unqual!(ElementType!Range) == char))
            buf ~= cast(char) c;
        else
            buf ~= cast(dchar) c;
    }

public:
    /// Stores the source range and lexes the first token.
    this(Range r)
    {
        source = r;
        popFront();
    }

    /// Returns: the current token.
    auto front() @property
    {
        return token;
    }

    /**
     * Advances to the next token.
     *
     * If previously split tokens are queued they are served first; otherwise
     * characters are consumed from the look-ahead buffer and then from the
     * source until the end of a token is detected. When nothing is left the
     * current token becomes the empty string.
     */
    void popFront()
    {
        import std.algorithm.searching : canFind, count;
        import std.uni : isAlpha;

        if (tokenStack.length > 0)
        {
            token = tokenStack.front;
            tokenStack.popFront;
            return;
        }

        bool seenLetters = false;
        State state = State.EMPTY;
        token = string.init;

        while (!source.empty || !charStack.empty)
        {
            // We only realize that we've reached the end of a token when we
            // find a character that's not part of the current token - since
            // that character may be part of the next token, it's stored in the
            // charStack.
            uint nextChar;

            if (!charStack.empty)
            {
                nextChar = cast(uint) charStack.front;
                charStack.popFront;
            }
            else
            {
                nextChar = cast(uint) source.front;
                source.popFront;
            }

            if (state == State.EMPTY)
            {
                debug(dateparser) writeln("EMPTY");
                // First character of the token - determines if we're starting
                // to parse a word, a number or something else.
                appendElement(token, nextChar);

                if (nextChar.isAlpha)
                    state = State.ALPHA;
                else if (nextChar.isNumber)
                    state = State.NUMERIC;
                else
                    break; // whitespace or punctuation: emit it as a one character token
                debug(dateparser) writeln("TOKEN ", token, " STATE ", state);
            }
            else if (state == State.ALPHA)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've already started reading a word, we keep reading
                // letters until we find something that's not part of a word.
                seenLetters = true;

                if (nextChar != '.' && nextChar != ',' &&
                    nextChar != '/' && nextChar != '-' &&
                    nextChar != ' ' && !nextChar.isNumber)
                {
                    appendElement(token, nextChar);
                }
                else if (nextChar == '.')
                {
                    appendElement(token, nextChar);
                    state = State.ALPHA_PERIOD;
                }
                else
                {
                    appendElement(charStack, nextChar);
                    break; //emit token
                }
            }
            else if (state == State.NUMERIC)
            {
                // If we've already started reading a number, we keep reading
                // numbers until we find something that doesn't fit.
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                if (nextChar.isNumber)
                    appendElement(token, nextChar);
                else if (nextChar == '.' || (nextChar == ',' && token.length >= 2))
                {
                    appendElement(token, nextChar);
                    state = State.NUMERIC_PERIOD;
                }
                else
                {
                    appendElement(charStack, nextChar);
                    debug(dateparser) writeln("charStack add: ", charStack);
                    break; //emit token
                }
            }
            else if (state == State.ALPHA_PERIOD)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've seen some letters and a dot separator, continue
                // parsing, and the tokens will be broken up later.
                seenLetters = true;
                if (nextChar == '.' || nextChar.isAlpha)
                    appendElement(token, nextChar);
                else if (nextChar.isNumber && token[$ - 1] == '.')
                {
                    appendElement(token, nextChar);
                    state = State.NUMERIC_PERIOD;
                }
                else
                {
                    appendElement(charStack, nextChar);
                    break; //emit token
                }
            }
            else if (state == State.NUMERIC_PERIOD)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've seen at least one dot separator, keep going, we'll
                // break up the tokens later.
                if (nextChar == '.' || nextChar.isNumber)
                    appendElement(token, nextChar);
                else if (nextChar.isAlpha && token[$ - 1] == '.')
                {
                    appendElement(token, nextChar);
                    state = State.ALPHA_PERIOD;
                }
                else
                {
                    appendElement(charStack, nextChar);
                    break; //emit token
                }
            }
        }

        debug(dateparser) writeln("STATE ", state, " seenLetters: ", seenLetters);

        // A token collected in one of the "period" states is only a single
        // decimal number when it holds digits around exactly one separator.
        // Anything else (letters involved, more than one dot, a trailing
        // separator) is split at every '.' and ','; the first piece becomes
        // the current token and the rest is queued. The state test comes
        // first so that `token[$ - 1]` is only evaluated on a non-empty token.
        immutable inPeriodState = state == State.ALPHA_PERIOD
            || state == State.NUMERIC_PERIOD;

        if (inPeriodState && (seenLetters || token.count('.') > 1
            || token[$ - 1] == '.' || token[$ - 1] == ','))
        {
            auto l = splitterWithMatches(token[], split_decimal);
            token = l.front;
            l.popFront;

            foreach (tok; l)
                if (tok.length > 0)
                    tokenStack ~= tok;
        }

        // A numeric token without any dot can only contain decimal commas
        // at this point; normalize them to dots.
        if (state == State.NUMERIC_PERIOD && !token.canFind('.'))
            token = token.replace(",", ".");
    }

    /// Returns: `true` once there is no current token and nothing left to lex.
    bool empty()() @property
    {
        return token.empty && source.empty && charStack.empty && tokenStack.empty;
    }
}

unittest
{
    import std.algorithm.comparison : equal;

    assert("Thu Sep 25 10:36:28 BRST 2003".timeLexer.equal(
        ["Thu", " ", "Sep", " ", "25", " ",
         "10", ":", "36", ":", "28", " ",
         "BRST", " ", "2003"]
    ));

    assert("2003-09-25T10:49:41.5-03:00".timeLexer.equal(
        ["2003", "-", "09", "-", "25", "T",
         "10", ":", "49", ":", "41.5", "-",
         "03", ":", "00"]
    ));
}

// Non-ASCII letters must survive lexing as valid UTF-8
unittest
{
    import std.algorithm.comparison : equal;

    assert("10 ao\u00FBt".timeLexer.equal(["10", " ", "ao\u00FBt"]));
}

/++
    Params: c = The character to test.
    Returns: Whether `c` is a number (0..9).
+/
pragma(inline, true)
bool isNumber(dchar c) @safe pure nothrow @nogc
{
    return c >= '0' && c <= '9';
}

unittest
{
    import std.internal.test.dummyrange : ReferenceInputRange;
    import std.algorithm.comparison : equal;

    auto a = new ReferenceInputRange!dchar("10:10");
    assert(a.timeLexer.equal(["10", ":", "10"]));

    auto b = new ReferenceInputRange!dchar("Thu Sep 10:36:28");
    assert(b.timeLexer.equal(["Thu", " ", "Sep", " ", "10", ":", "36", ":", "28"]));
}