PageRenderTime 68ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/dsplit.d

http://github.com/CyberShadow/DustMite
D | 528 lines | 500 code | 22 blank | 6 comment | 24 complexity | 48e8a3d8f7ab655e48e11d67d3e19989 MD5 | raw file
  1. /// Very simplistic D source code "parser"
  2. /// Written by Vladimir Panteleev <vladimir@thecybershadow.net>
  3. /// Released into the Public Domain
  4. module dsplit;
  5. import std.file;
  6. import std.path;
  7. import std.string;
  8. import std.ascii;
  9. import std.array;
  10. debug import std.stdio;
  /// One node of the tree that the input is split into.
  /// A node carries leading text (`head`), nested nodes (`children`)
  /// and trailing text (`tail`).
  class Entity
  {
      string head;                /// text stored before the children
      Entity[] children;          /// nested entities
      string tail;                /// text stored after the children

      string filename, contents;  /// assigned by loadFile for entities that represent a file

      /// Whether this entity has a non-empty file name.
      @property bool isFile() { return filename.length != 0; }

      bool isPair;                /// internal hint
      bool noRemove;              /// don't try removing this entity (children OK)

      bool removed;               /// For dangling dependencies
      Entity[] dependencies;

      int id;                     /// For diagnostics
      size_t descendants;         /// For progress display

      /// Creates an entity; every argument is optional and is stored as given.
      this(string head = null,
           Entity[] children = null,
           string tail = null,
           string filename = null,
           bool isPair = false)
      {
          this.head     = head;
          this.children = children;
          this.tail     = tail;
          this.filename = filename;
          this.isPair   = isPair;
      }
  }
  /// Settings that control how loadFile turns file contents into entities.
  struct ParseOptions
  {
      /// Splitting strategy.
      enum Mode
      {
          Source, /// split by file extension: parseD for .d/.di, one entity per file otherwise
          Words,  /// split every file with parseToWords
      }

      bool stripComments; /// run stripDComments on .d/.di files before splitting
      Mode mode;          /// selected splitting strategy
  }
  /// Loads a single file or a whole directory tree into an entity tree.
  ///
  /// If `path` names a file, that file is loaded under its base name and
  /// `path` (passed by reference) is replaced with the path minus its extension.
  /// Otherwise `path` is walked with dirEntries and every file found becomes a
  /// child of a new root entity, named by its path relative to `path`.
  ///
  /// Params:
  ///   path    = file or directory to load; modified only in the single-file case
  ///   options = forwarded to loadFile
  /// Returns: the file entity, or a root entity holding one child per file.
  Entity loadFiles(ref string path, ParseOptions options)
  {
      if (isFile(path))
      {
          auto filePath = path;
          path = stripExtension(path);
          return loadFile(baseName(filePath).replace(`\`, `/`), filePath, options);
      }

      // Number of leading characters to drop from each entry to get its name
      // relative to the root. A separator has to be skipped as well, unless
      // `path` already ends with one - skipping an extra character in that case
      // would cut off the first letter of every file name.
      size_t prefixLength = path.length;
      if (path.length == 0 || !isDirSeparator(path[$-1]))
          prefixLength++;

      auto set = new Entity();
      foreach (string entry; dirEntries(path, SpanMode.breadth))
          if (isFile(entry))
          {
              assert(entry.startsWith(path));
              set.children ~= loadFile(entry[prefixLength..$], entry, options);
          }
      return set;
  }
  /// Upper bound on the number of children a node has once optimize has processed it.
  enum BIN_SIZE = 2;

  /// Rebalances the tree under `set` in place: children are processed first,
  /// then each node's child list is repeatedly grouped under new wrapper
  /// entities until it holds at most BIN_SIZE items.
  void optimize(Entity set)
  {
      // Replaces items[from..to] with a single wrapper entity whose children
      // are a copy of that range.
      static void wrapRange(ref Entity[] items, size_t from, size_t to)
      {
          auto wrapper = new Entity(null, items[from..to].dup, null);
          items.replaceInPlace(from, to, [wrapper]);
      }

      // Groups `items` into bins until no more than `binSize` remain.
      static void clusterBy(ref Entity[] items, size_t binSize)
      {
          while (items.length > binSize)
          {
              size_t chunk;
              if (items.length >= binSize*2)
                  chunk = binSize;
              else
                  chunk = (items.length+1) / 2;

              immutable fullBins = items.length / chunk;
              immutable leftover = items.length % chunk;

              // A leftover of exactly one item is left unwrapped.
              if (leftover > 1)
                  wrapRange(items, fullBins*chunk, items.length);

              // Walk backwards so earlier indices stay valid while wrapping.
              for (size_t bin = fullBins; bin-- > 0; )
                  wrapRange(items, bin*chunk, (bin+1)*chunk);
          }
      }

      // Depth-first: descendants are clustered before their parent.
      static void visit(Entity node)
      {
          foreach (child; node.children)
              visit(child);
          clusterBy(node.children, BIN_SIZE);
      }

      visit(set);
  }
  89. private:
  /// Reads one file from disk and splits it into entities.
  ///
  /// Params:
  ///   name    = name stored in the entity (backslashes are replaced with slashes)
  ///   path    = location the contents are read from
  ///   options = comment stripping and splitting mode
  /// Returns: a file entity with `filename`, `contents` and `children` filled in.
  Entity loadFile(string name, string path, ParseOptions options)
  {
      debug writeln("Loading ", path);

      auto file = new Entity();
      file.filename = name.replace(`\`, `/`);
      file.contents = cast(string)read(path);

      immutable ext = extension(path);
      immutable isDSource = ext == ".d" || ext == ".di";

      if (options.stripComments && isDSource)
          file.contents = stripDComments(file.contents);

      final switch (options.mode)
      {
          case ParseOptions.Mode.Source:
              // One could add custom splitters for other languages here - for example,
              // a simple line/word/character splitter for most text-based formats
              if (isDSource)
                  file.children = parseD(file.contents);
              else
                  file.children = [new Entity(file.contents, null, null)];
              break;

          case ParseOptions.Mode.Words:
              file.children = parseToWords(file.contents);
              break;
      }
      return file;
  }
  /// Advances `i` past one lexical symbol of `s` and returns the text skipped.
  ///
  /// Recognised symbols: character literals, a backslash followed by one
  /// character, double-quoted strings (with escapes, or raw when directly preceded
  /// by `r`), backquoted strings, and `//`, `/* */` and nesting `/+ +/` comments.
  /// Anything else, including a `/` that does not start a comment, is a
  /// one-character symbol.
  ///
  /// A literal or comment that is never terminated extends to the end of `s`;
  /// `i` is never moved past `s.length`.
  ///
  /// Params:
  ///   s = text being scanned
  ///   i = position of the symbol's first character (must be < s.length);
  ///       on return, the position just after the symbol
  /// Returns: the slice of `s` covering the symbol.
  string skipSymbol(string s, ref size_t i)
  {
      immutable start = i;

      // Moves i just past the next `terminator`, or to the end of input if there is none.
      void skipPast(char terminator)
      {
          while (i < s.length && s[i] != terminator)
              i++;
          if (i < s.length)
              i++;
      }

      switch (s[i])
      {
          case '\'':
              i++;
              // Step over an escaped character so that '\'' is not closed early.
              if (i < s.length && s[i] == '\\')
                  i += 2;
              skipPast('\'');
              break;

          case '\\':
              i += 2;
              break;

          case '"':
              if (i && s[i-1] == 'r')
              {
                  // Raw string: backslashes are not escapes.
                  i++;
                  skipPast('"');
              }
              else
              {
                  i++;
                  while (i < s.length && s[i] != '"')
                      i += (s[i] == '\\') ? 2 : 1;
                  if (i < s.length)
                      i++;
              }
              break;

          case '`':
              i++;
              skipPast('`');
              break;

          case '/':
              i++;
              if (i == s.length)
                  break;
              if (s[i] == '/')
              {
                  // Line comment: stops before the line break.
                  while (i < s.length && s[i] != '\r' && s[i] != '\n')
                      i++;
              }
              else if (s[i] == '*')
              {
                  // Start looking for "*/" only after "/*" plus two more characters,
                  // so that "/*/" is not mistaken for a complete comment.
                  i += 3;
                  while (i <= s.length && (s[i-2] != '*' || s[i-1] != '/'))
                      i++;
              }
              else if (s[i] == '+')
              {
                  i++;
                  int commentLevel = 1;
                  while (commentLevel && i < s.length)
                  {
                      immutable hasPair = i + 1 < s.length;
                      if (hasPair && s[i] == '/' && s[i+1] == '+')
                      {
                          commentLevel++;
                          i += 2;
                      }
                      else if (hasPair && s[i] == '+' && s[i+1] == '/')
                      {
                          commentLevel--;
                          i += 2;
                      }
                      else
                          i++;
                  }
              }
              // Otherwise this is a lone '/': only the slash itself is consumed, so
              // that a following delimiter or quote is seen by the caller.
              break;

          default:
              i++;
              break;
      }

      // Two-character steps over truncated input may overshoot the end.
      if (i > s.length)
          i = s.length;
      return s[start..i];
  }
  /// Advances `i` over whitespace and `//` comments. Stops right after the first
  /// run of line-break characters, or at the first other non-whitespace
  /// character, or at the end of `s`.
  void skipToEOL(string s, ref size_t i)
  {
      static bool isLineBreak(char c) { return c == '\r' || c == '\n'; }

      for (;;)
      {
          if (i >= s.length)
              return;

          immutable c = s[i];
          if (isLineBreak(c))
          {
              // Consume the entire run of CR/LF characters, then stop.
              do
                  i++;
              while (i < s.length && isLineBreak(s[i]));
              return;
          }
          if (isWhite(c))
          {
              i++;
              continue;
          }
          if (!s[i..$].startsWith("//"))
              return;
          skipSymbol(s, i);
      }
  }
  /// Moves `i` back over whitespace that precedes it on the current line.
  /// Stops after a '\n', at a non-whitespace character, or at `start`,
  /// whichever is reached first.
  void backToEOL(string s, ref size_t i, size_t start)
  {
      for (; i > start; i--)
      {
          immutable previous = s[i-1];
          if (previous == '\n' || !isWhite(previous))
              break;
      }
  }
  /// Splits D source text into a tree of entities and returns the top-level list.
  ///
  /// The text is scanned once, left to right. Bracketed regions ({}, (), [])
  /// become nested scopes, and the separators ';', ',' and ' ' close off
  /// entities within a scope. The resulting list is then passed to postProcessD.
  Entity[] parseD(string s)
  {
      size_t i = 0;      // current scan position in s (shared by all nested calls)
      size_t start;      // start of the text that has not been put into an entity yet
      string innerTail;  // written by parseScope just before returning: the closing
                         // character plus what skipToEOL consumed after it, or null
                         // if the input ended before a closing character was found

      // Parses entities until the character `end` is found (never, if `end` is 0)
      // or the input runs out.
      Entity[] parseScope(char end)
      {
          // Here be dragons.

          enum MAX_SPLITTER_LEVELS = 5;
          struct DSplitter { char open, close, sep; }
          // One entry per splitting level, outermost first. An entry may define a
          // bracket pair, a separator character, or both.
          static const DSplitter[MAX_SPLITTER_LEVELS] splitters = [{'{','}',';'}, {'(',')'}, {'[',']'}, {sep:','}, {sep:' '}];

          // Entities collected so far at each level and not yet attached to a parent.
          Entity[][MAX_SPLITTER_LEVELS] splitterQueue;

          // Flushes `level` and every deeper level, returning the entities collected
          // at `level`. Below the deepest level, the pending text s[start..i] is
          // consumed and turned into entities by splitText.
          Entity[] terminateLevel(int level)
          {
              if (level == MAX_SPLITTER_LEVELS)
              {
                  auto text = s[start..i];
                  start = i;
                  return splitText(text);
              }
              else
              {
                  auto next = terminateLevel(level+1);
                  // Zero or one entity is appended as-is; several are wrapped in a
                  // single parent entity first.
                  if (next.length <= 1)
                      splitterQueue[level] ~= next;
                  else
                      splitterQueue[level] ~= new Entity(null, next, null);
                  auto r = splitterQueue[level];
                  splitterQueue[level] = null;
                  return r;
              }
          }

          // Returns the pending text s[start..i] and marks it as consumed.
          string terminateText()
          {
              auto r = s[start..i];
              start = i;
              return r;
          }

          characterLoop:
          while (i < s.length)
          {
              char c = s[i];
              foreach (int level, info; splitters)
                  if (info.sep && c == info.sep)
                  {
                      // Separator: everything collected at deeper levels becomes the
                      // children of a new entity whose tail is the separator plus
                      // whatever skipToEOL consumes after it.
                      auto children = terminateLevel(level+1);
                      assert(i == start);
                      i++; skipToEOL(s, i);
                      splitterQueue[level] ~= new Entity(null, children, terminateText());
                      continue characterLoop;
                  }
                  else
                  if (info.open && c == info.open)
                  {
                      auto openPos = i;
                      // Step back over whitespace before the opener on its line, so
                      // that it goes into the start sequence and not into the head.
                      backToEOL(s, i, start);
                      auto pairHead = terminateLevel(level+1);

                      i = openPos+1; skipToEOL(s, i);
                      auto startSequence = terminateText();
                      // Recurse; on return innerTail holds the text of the closing side.
                      auto bodyContents = parseScope(info.close);

                      auto pairBody = new Entity(startSequence, bodyContents, innerTail);

                      // Attach the body: alone if nothing preceded it, otherwise as a
                      // two-child entity [head, body] flagged with isPair.
                      if (pairHead.length == 0)
                          splitterQueue[level] ~= pairBody;
                      else
                      if (pairHead.length == 1)
                          splitterQueue[level] ~= new Entity(null, pairHead ~ pairBody, null, null, true);
                      else
                          splitterQueue[level] ~= new Entity(null, [new Entity(null, pairHead, null), pairBody], null, null, true);
                      continue characterLoop;
                  }

              // Reached only when c matched no separator and no opening bracket.
              if (end && c == end)
              {
                  auto closePos = i;
                  backToEOL(s, i, start);
                  auto result = terminateLevel(0);
                  i = closePos+1; skipToEOL(s, i);
                  innerTail = terminateText();
                  return result;
              }
              else
                  skipSymbol(s, i);
          }

          // Input ended without finding `end`.
          innerTail = null;
          return terminateLevel(0);
      }

      auto result = parseScope(0);
      postProcessD(result);
      return result;
  }
  /// Returns a copy of `s` without the symbols that skipSymbol reports as comments.
  /// All other symbols are copied unchanged and in order.
  string stripDComments(string s)
  {
      auto kept = appender!string();
      for (size_t pos = 0; pos < s.length; )
      {
          string token = skipSymbol(s, pos);
          if (startsWithComment(token))
              continue;
          kept.put(token);
      }
      return kept.data;
  }
  /// Post-processes a list of sibling entities in place, then recurses into
  /// the children of each:
  ///  - the tail of a comma-terminated entity is moved into a child entity of its
  ///    own, which depends on the item before it and on the head of the next sibling;
  ///  - neighbouring entities that look like parts of one construct
  ///    (do/while, try/catch, try/finally, or a following in/out/body) are
  ///    wrapped in a common parent.
  void postProcessD(ref Entity[] entities)
  {
      int i = 0;
      while (i < entities.length)
      {
          immutable hasNext = i+2 <= entities.length;
          auto current = entities[i];

          // Add dependencies for comma-separated lists.
          if (hasNext && current.children.length >= 1 && current.tail.stripD() == ",")
          {
              auto precedingItem = current.children[$-1];
              auto comma = new Entity(current.tail);
              current.children ~= comma;
              current.tail = null;
              comma.dependencies ~= [precedingItem, getHeadEntity(entities[i+1])];
          }

          // Group together consecutive entities which might represent a single language construct
          // There is no penalty for false positives, so accuracy is not very important
          if (hasNext && entities.length > 2)
          {
              auto thisHead = getHeadText(current);
              auto nextHead = getHeadText(entities[i+1]);

              immutable bool belongTogether =
                     (thisHead.startsWithWord("do")  && nextHead.isWord("while"))
                  || (thisHead.startsWithWord("try") && nextHead.startsWithWord("catch"))
                  || (thisHead.startsWithWord("try") && nextHead.startsWithWord("finally"))
                  || nextHead.isWord("in")
                  || nextHead.isWord("out")
                  || nextHead.isWord("body");

              if (belongTogether)
              {
                  auto pair = entities[i..i+2].dup;
                  entities.replaceInPlace(i, i+2, [new Entity(null, pair, null)]);
                  // Stay on the same index: the new group may combine with its successor.
                  continue;
              }
          }

          postProcessD(current.children);
          i++;
      }
  }
  /// Set of words that splitText looks for at the start of a piece of text;
  /// text beginning with one of them is cut right after it.
  /// Filled in by the module constructor below.
  const bool[string] wordsToSplit;
  /// Module constructor: fills wordsToSplit ("else" is its only entry).
  static this() { wordsToSplit = ["else":true]; }
  /// Turns a run of text into leaf entities. While the text starts with a word
  /// from wordsToSplit, that word (plus what skipToEOL consumes after it) is cut
  /// off into an entity of its own. Whatever remains becomes the last entity.
  /// Returns an empty list for empty text.
  Entity[] splitText(string s)
  {
      Entity[] pieces;
      for (string rest = s; rest.length; )
      {
          auto word = firstWord(rest);
          if (word !in wordsToSplit)
          {
              pieces ~= new Entity(rest, null, null);
              break;
          }

          // `word` is a slice of `rest`, so the pointer difference is its offset.
          size_t cut = (word.ptr - rest.ptr) + word.length;
          skipToEOL(rest, cut);
          pieces ~= new Entity(rest[0..cut], null, null);
          rest = rest[cut..$];
      }
      return pieces;
  }
  /// Returns the slice of `s` from its first to its last symbol that is neither
  /// whitespace nor a comment. The result is empty if there is no such symbol.
  string stripD(string s)
  {
      size_t first = s.length;
      size_t last  = s.length;

      for (size_t pos = 0; pos < s.length; )
      {
          if (startsWithComment(s[pos..$]))
          {
              skipSymbol(s, pos);
              continue;
          }
          if (isWhite(s[pos]))
          {
              pos++;
              continue;
          }

          // A significant symbol: remember where the first one starts and
          // where the latest one ends.
          if (pos < first)
              first = pos;
          skipSymbol(s, pos);
          last = pos;
      }
      return s[first..last];
  }
  /// Returns the leading run of non-whitespace characters of `s` after
  /// stripD has been applied to it. The result is a slice of `s`.
  string firstWord(string s)
  {
      auto trimmed = stripD(s);
      size_t len = 0;
      foreach (c; trimmed)
      {
          if (isWhite(c))
              break;
          len++;
      }
      return trimmed[0..len];
  }
  /// Whether `s`, after stripD, begins with `word` and the character after it
  /// (if there is one) is not alphanumeric.
  bool startsWithWord(string s, string word)
  {
      auto text = stripD(s);
      if (!text.startsWith(word))
          return false;
      if (text.length == word.length)
          return true;
      return !isAlphaNum(text[word.length]);
  }
  /// Whether `s`, after stripD, ends with `word` and the character before it
  /// (if there is one) is not alphanumeric.
  bool endsWithWord(string s, string word)
  {
      auto text = stripD(s);
      if (!text.endsWith(word))
          return false;
      if (text.length == word.length)
          return true;
      return !isAlphaNum(text[$ - word.length - 1]);
  }
  /// Whether `s` consists of exactly `word` once stripD has been applied to it.
  bool isWord(string s, string word)
  {
      auto text = stripD(s);
      return text == word;
  }
  /// Whether `s` begins with one of the comment openers "//", "/*" or "/+".
  bool startsWithComment(string s)
  {
      if (s.length < 2 || s[0] != '/')
          return false;
      switch (s[1])
      {
          case '/', '*', '+':
              return true;
          default:
              return false;
      }
  }
  /// Finds the first entity, in depth-first order starting at `e`, that carries
  /// text: `e` itself if its head is non-empty, else the first match among its
  /// children, else `e` if its tail is non-empty. Returns null if none is found.
  Entity getHeadEntity(Entity e)
  {
      if (e.head.length != 0)
          return e;

      foreach (child; e.children)
          if (auto found = getHeadEntity(child))
              return found;

      return e.tail.length != 0 ? e : null;
  }
  /// Returns the first piece of text found in the tree rooted at `e`: the head of
  /// the entity chosen by getHeadEntity, or that entity's tail when its head is
  /// empty. Returns null if getHeadEntity finds nothing.
  string getHeadText(Entity e)
  {
      e = getHeadEntity(e);
      if (e is null)
          return null;
      // Test the length, as getHeadEntity does. Using the string itself as a
      // condition only checks for a non-null pointer, so an empty but non-null
      // head would be returned in place of the tail.
      return e.head.length ? e.head : e.tail;
  }
  // ParseOptions.Mode.Words

  /// Whether `c` counts as part of a word: an ASCII letter or digit, '_' or '@'.
  bool isDWordChar(char c)
  {
      switch (c)
      {
          case '_', '@':
              return true;
          default:
              return isAlphaNum(c);
      }
  }
  /// Splits `text` into entities at every transition from a non-word character
  /// to a word character (see isDWordChar). Each entity's head is a word and
  /// its tail is the non-word text that follows it. Joining every head and tail
  /// in order gives back `text`.
  ///
  /// Note: the end of a word is only recorded when a non-word character follows
  /// it, so if `text` ends in a word character the final entity has an empty
  /// head and holds that word in its tail.
  public Entity[] parseToWords(string text)
  {
      Entity[] words;
      size_t headStart = 0, headEnd = 0;

      foreach (pos; 1 .. text.length + 1)
      {
          immutable atEnd      = pos == text.length;
          immutable prevInWord = isDWordChar(text[pos-1]);
          immutable curInWord  = !atEnd && isDWordChar(text[pos]);

          if (atEnd || (!prevInWord && curInWord))
          {
              // A new word starts here (or the input ends): emit what came before.
              if (headStart != pos)
                  words ~= new Entity(text[headStart..headEnd], null, text[headEnd..pos]);
              headStart = headEnd = pos;
          }
          else if (prevInWord && !curInWord)
              headEnd = pos;
      }
      return words;
  }