/**
 * Boost Software License - Version 1.0 - August 17th, 2003
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 *
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

module dateparser.timelexer;

debug(dateparser) import std.stdio;
import std.range;
import std.traits;
import std.regex;
import dateparser.splitter;

// States of the lexer's character-class state machine. `PERIOD` is declared
// but never assigned by the lexer in this module.
private enum State
{
    EMPTY,
    ALPHA,
    NUMERIC,
    ALPHA_PERIOD,
    PERIOD,
    NUMERIC_PERIOD
}

package:

// Needs to be explicitly flagged global for the backwards compatible
// version of splitterWithMatches
enum split_decimal = ctRegex!(`([\.,])`, "g");

/**
 * This function breaks the time string into lexical units (tokens), which
 * can be parsed by the parser. Lexical units are demarcated by changes in
 * the character set, so any continuous string of letters is considered
 * one unit, any continuous string of numbers is considered one unit.
 *
 * The main complication arises from the fact that dots ('.') can be used
 * both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
 * "4:30:21.447"). As such, it is necessary to read the full context of
 * any dot-separated strings before breaking it into tokens; as such, this
 * function maintains a "token stack", for when the ambiguous context
 * demands that multiple tokens be parsed at once.
 *
 * Params:
 *     r = the range to parse
 * Returns:
 *     an input range of strings
 */
auto timeLexer(Range)(Range r) if (isInputRange!Range && is(Unqual!(ElementType!Range) == char))
{
    return TimeLexerResult!Range(r);
}

// Issue 15831: This should be a Voldemort type, but due to linker slowdown
// it's a good idea to put this outside so we don't slowdown people's build
// times
struct TimeLexerResult(Range)
{
private:
    // The remaining, not yet consumed characters.
    Range source;
    // Push-back buffer: a character that was read from `source`, turned out
    // not to belong to the current token, and must start the next one.
    string charStack;
    // Tokens produced by splitting a dotted token that are still waiting to
    // be handed out by subsequent calls to `popFront`.
    string[] tokenStack;
    // The token currently exposed through `front`.
    string token;

public:
    /// Stores the source range and lexes the first token.
    this(Range r)
    {
        source = r;
        popFront;
    }

    /// The current token.
    auto front() @property
    {
        return token;
    }

    /**
     * Advances to the next token.
     *
     * Pending tokens in `tokenStack` are handed out first. Otherwise
     * characters are consumed (from the push-back buffer, then from the
     * source) until one is found that does not belong to the current token.
     */
    void popFront()
    {
        import std.utf : byCodeUnit;
        import std.algorithm.searching : canFind, count;
        import std.uni : isAlpha;
        import std.ascii : isDigit;

        if (tokenStack.length > 0)
        {
            immutable f = tokenStack.front;
            tokenStack.popFront;
            token = f;
            return;
        }

        bool seenLetters = false;
        State state = State.EMPTY;
        token = string.init;

        while (!source.empty || !charStack.empty)
        {
            // We only realize that we've reached the end of a token when we
            // find a character that's not part of the current token - since
            // that character may be part of the next token, it's stored in the
            // charStack.
            char nextChar;

            if (!charStack.empty)
            {
                nextChar = charStack[0];
                charStack = charStack[1 .. $];
            }
            else
            {
                nextChar = source.front;
                source.popFront;
            }

            if (state == State.EMPTY)
            {
                debug(dateparser) writeln("EMPTY");
                // First character of the token - determines if we're starting
                // to parse a word, a number or something else.
                token ~= nextChar;

                if (nextChar.isAlpha)
                    state = State.ALPHA;
                else if (nextChar.isDigit)
                    state = State.NUMERIC;
                else if (nextChar == ' ')
                {
                    token = " ";
                    break; //emit token
                }
                else
                    break; //emit token
                debug(dateparser) writeln("TOKEN ", token, " STATE ", state);
            }
            else if (state == State.ALPHA)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've already started reading a word, we keep reading
                // letters until we find something that's not part of a word.
                seenLetters = true;

                if (nextChar != '.' && nextChar != ',' &&
                    nextChar != '/' && nextChar != '-' &&
                    nextChar != '+' && nextChar != ' ' &&
                    !nextChar.isDigit)
                {
                    token ~= nextChar;
                }
                else if (nextChar == '.')
                {
                    token ~= nextChar;
                    state = State.ALPHA_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
            else if (state == State.NUMERIC)
            {
                // If we've already started reading a number, we keep reading
                // numbers until we find something that doesn't fit.
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                if (nextChar.isDigit)
                    token ~= nextChar;
                // A comma is only taken as part of the number once at least
                // two characters have been collected.
                else if (nextChar == '.' || (nextChar == ',' && token.length >= 2))
                {
                    token ~= nextChar;
                    state = State.NUMERIC_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    debug(dateparser) writeln("charStack add: ", charStack);
                    break; //emit token
                }
            }
            else if (state == State.ALPHA_PERIOD)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've seen some letters and a dot separator, continue
                // parsing, and the tokens will be broken up later.
                seenLetters = true;
                if (nextChar == '.' || nextChar.isAlpha)
                {
                    token ~= nextChar;
                }
                else if (nextChar.isDigit && token[$ - 1] == '.')
                {
                    token ~= nextChar;
                    state = State.NUMERIC_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
            else if (state == State.NUMERIC_PERIOD)
            {
                debug(dateparser) writeln("STATE ", state, " nextChar: ", nextChar);
                // If we've seen at least one dot separator, keep going, we'll
                // break up the tokens later.
                if (nextChar == '.' || nextChar.isDigit)
                    token ~= nextChar;
                else if (nextChar.isAlpha && token[$ - 1] == '.')
                {
                    token ~= nextChar;
                    state = State.ALPHA_PERIOD;
                }
                else
                {
                    charStack ~= nextChar;
                    break; //emit token
                }
            }
        }

        debug(dateparser) writeln("STATE ", state, " seenLetters: ", seenLetters);

        // A token that went through one of the period states is split on
        // '.' / ',' when it contains letters, has more than one dot, or ends
        // in a separator. The first piece becomes the current token and the
        // non-empty remaining pieces are queued. `token` is never empty here
        // because both period states are only reachable after at least one
        // character has been appended.
        if ((state == State.ALPHA_PERIOD || state == State.NUMERIC_PERIOD)
                && (seenLetters || token.byCodeUnit.count('.') > 1
                || (token[$ - 1] == '.' || token[$ - 1] == ',')))
        {
            auto l = splitterWithMatches(token[], split_decimal);
            token = l.front;
            l.popFront;

            foreach (tok; l)
                if (tok.length > 0)
                    tokenStack ~= tok;
        }

        // A numeric token that contains no '.' has its ',' rewritten to '.'.
        if (state == State.NUMERIC_PERIOD && !token.byCodeUnit.canFind('.'))
            token = token.replace(",", ".");
    }

    /// True once there is no current token and no buffered or unread input.
    bool empty() @nogc @safe @property nothrow pure
    {
        return token.empty && source.empty && charStack.empty && tokenStack.empty;
    }
}

unittest
{
    import std.algorithm.comparison : equal;
    import std.utf : byCodeUnit;

    assert("Thu Sep 25 10:36:28 BRST 2003".byCodeUnit.timeLexer.equal(
        ["Thu", " ", "Sep", " ", "25", " ",
         "10", ":", "36", ":", "28", " ",
         "BRST", " ", "2003"]
    ));

    assert("2003-09-25T10:49:41.5-03:00".byCodeUnit.timeLexer.equal(
        ["2003", "-", "09", "-", "25", "T",
         "10", ":", "49", ":", "41.5", "-",
         "03", ":", "00"]
    ));
}

unittest
{
    import std.algorithm.comparison : equal;
    import std.utf : byChar;

    assert("10:10"
        .byChar
        .timeLexer
        .equal(["10", ":", "10"]));
    assert("Thu Sep 10:36:28"
        .byChar
        .timeLexer
        .equal(["Thu", " ", "Sep", " ", "10", ":", "36", ":", "28"]));
}