
/.config/sublime-text-2/Packages/SublimeCodeIntel/libs/textinfo.py

https://bitbucket.org/ecool/dotfiles
#!/usr/bin/env python
# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License
# Version 1.1 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
# License for the specific language governing rights and limitations
# under the License.
#
# The Original Code is Komodo code.
#
# The Initial Developer of the Original Code is ActiveState Software Inc.
# Portions created by ActiveState Software Inc are Copyright (C) 2000-2007
# ActiveState Software Inc. All Rights Reserved.
#
# Contributor(s):
#   ActiveState Software Inc
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****
r"""Determine information about text files.

This module efficiently determines the encoding of text files (see
_classify_encoding for details), accurately identifies binary files, and
provides detailed meta information about text files.

    >>> import textinfo
    >>> path = __file__
    >>> if path.endswith(".pyc"): path = path[:-1]
    >>> ti = textinfo.textinfo_from_path(path)
    >>> ti.__class__
    <class 'textinfo.TextInfo'>
    >>> ti.encoding
    'utf-8'
    >>> ti.file_type_name
    'regular file'
    >>> ti.is_text
    True
    >>> ti.lang
    'Python'
    >>> ti.langinfo
    <Python LangInfo>

...plus a number of other useful bits of information gleaned from the
file. To see a list of all useful attributes see:

    >> list(ti.as_dict().keys())
    ['encoding', 'file_type', ...]

Note: This module requires at least Python 2.5 to use
`codecs.lookup(<encname>).name`.
"""

_cmdln_doc = """Determine information about text files.
"""
# TODO:
# - [high prio] prefs integration
# - aggregate "is there an explicit encoding decl in this file" from XML,
#   HTML, lang-specific, emacs and vi var decls (as discussed with Shane)
# - fix ti with unicode paths on Windows (check on Linux too)
# - '-L|--dereference' option a la `file` and `ls`
# - See: http://webblaze.cs.berkeley.edu/2009/content-sniffing/
# - Shift-JIS encoding is not detected for
#   http://public.activestate.com/pub/apc/perl-current/lib/Pod/Simple/t/corpus/s2763_sjis.txt
#   [Jan wrote]
#   > While the document isn't identified by filename extension as POD,
#   > it does contain POD and a corresponding =encoding directive.
#   Could potentially have a content heuristic check for POD.
#
# ----------------
# Current Komodo (4.2) Encoding Determination Notes (used for reference,
# but not absolutely followed):
#
# Working through koDocumentBase._detectEncoding:
#
#   encoding_name = pref:encodingDefault (on first start is set
#       to encoding from locale.getdefaultlocale() typically,
#       fallback to iso8859-1; default locale typically ends up being:
#           Windows:        cp1252
#           Mac OS X:       mac-roman
#           (modern) Linux: UTF-8)
#   encoding = the python name for this
#   tryencoding = pref:encoding (no default, explicitly set
#       encoding) -- i.e. if there are doc prefs for this
#       path, then give this encoding a try. If not given,
#       then utf-8 for XML/XSLT/VisualBasic and
#       pref:encodingDefault for others (though this is
#       all prefable via the 'languages' pref struct).
#   tryxmldecl
#   trymeta (HTML meta)
#   trymodeline
#   autodetect (whether to try at all)
#
#   if autodetect or tryencoding:
#       koUnicodeEncoding.autoDetectEncoding()
#   else:
#       if encoding.startswith('utf'): # note this is pref:encodingDefault
#           check bom
#           presume encoding is right (give up if conversion fails)
#       else:
#           presume encoding is right (give up if conversion fails)
#
# Working through koUnicodeEncoding.autoDetectEncoding:
#
#   if tryxmldecl: ...
#   if tryhtmlmeta: ...
#   if trymodeline: ...
#   use bom: ...
# ----------------
__version_info__ = (0, 1, 0)
__version__ = '.'.join(map(str, __version_info__))

import os
from os.path import join, dirname, abspath, basename, exists
import sys
import re
from pprint import pprint
import traceback
import warnings
import logging
import optparse
import codecs
import locale

import langinfo


#---- exceptions and warnings

class TextInfoError(Exception):
    pass

class TextInfoConfigError(TextInfoError):
    pass

class ChardetImportWarning(ImportWarning):
    pass
warnings.simplefilter("once", ChardetImportWarning)


#---- globals

log = logging.getLogger("textinfo")

# For debugging:
DEBUG_CHARDET_INFO = False  # gather chardet info

#---- module API

def textinfo_from_filename(path):
    """Determine text info for the given path **using the filename only**.

    No attempt is made to stat or read the file.
    """
    return TextInfo.init_from_filename(path)

def textinfo_from_path(path, encoding=None, follow_symlinks=False,
                       quick_determine_lang=False):
    """Determine text info for the given path.

    This raises EnvironmentError if the path does not exist or could
    not be read.
    """
    return TextInfo.init_from_path(path, encoding=encoding,
                                   follow_symlinks=follow_symlinks,
                                   quick_determine_lang=quick_determine_lang)
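
# The sketch below is illustrative only and is not part of the module
# API: it contrasts the two entry points above. The "setup.py" default
# path is an assumption; any readable text file will do.
def _example_module_api(path="setup.py"):
    """Hypothetical demo: filename-only vs. full-path classification."""
    ti_cheap = textinfo_from_filename(path)  # no stat/read of the file
    ti_full = textinfo_from_path(path)       # stats and reads the file
    return (ti_cheap.lang, ti_full.lang, ti_full.encoding, ti_full.is_text)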

#---- main TextInfo class

class TextInfo(object):
    path = None
    file_type_name = None  # e.g. "regular file", "directory", ...
    file_type = None       # stat.S_IFMT(os.stat(path).st_mode)
    file_mode = None       # stat.S_IMODE(os.stat(path).st_mode)
    is_text = None
    encoding = None
    has_bom = None         # whether the text has a BOM (Byte Order Marker)
    encoding_bozo = False
    encoding_bozo_reasons = None
    lang = None            # e.g. "Python", "Perl", ...
    langinfo = None        # langinfo.LangInfo instance or None

    # Enable chardet-based heuristic guessing of encoding as a last
    # resort for file types known to not be binary.
    CHARDET_ENABLED = True
    CHARDET_THRESHHOLD = 0.9  # >=90% confidence to avoid false positives.

    @classmethod
    def init_from_filename(cls, path, lidb=None):
        """Create an instance using only the filename to initialize."""
        if lidb is None:
            lidb = langinfo.get_default_database()
        self = cls()
        self.path = path
        self._classify_from_filename(lidb)
        return self

    @classmethod
    def init_from_path(cls, path, encoding=None, lidb=None,
                       follow_symlinks=False,
                       quick_determine_lang=False,
                       env=None):
        """Create an instance using the filename and stat/read info
        from the given path to initialize.

        @param follow_symlinks {boolean} can be set to True to have
            the textinfo returned for a symlink be for the linked-to
            file. By default the textinfo is for the symlink itself.
        @param quick_determine_lang {boolean} can be set to True to have
            processing stop as soon as the language has been determined.
            Note that this means some fields will not be populated.
        @param env {runtime environment} A "runtime environment" class
            whose behaviour is used to influence processing. Currently
            it is just used to provide a hook for lang determination
            by filename (for Komodo).
        """
        if lidb is None:
            lidb = langinfo.get_default_database()
        self = cls()
        self.path = path
        self._accessor = PathAccessor(path, follow_symlinks=follow_symlinks)
        try:
            #TODO: pref: Is a preference specified for this path?

            self._classify_from_stat(lidb)
            if self.file_type_name != "regular file":
                # Don't continue if not a regular file.
                return self

            #TODO: add 'pref:treat_as_text' a la TextMate (or
            #      perhaps that is handled in _classify_from_filename())

            self._classify_from_filename(lidb, env)
            if self.is_text is False:
                return self
            if self.lang and quick_determine_lang:
                return self

            if not self.lang:
                self._classify_from_magic(lidb)
                if self.is_text is False:
                    return self
                if self.lang and quick_determine_lang:
                    return self

            self._classify_encoding(lidb, suggested_encoding=encoding)
            if self.is_text is None and self.encoding:
                self.is_text = True
            if self.is_text is False:
                return self
            self.text = self._accessor.text

            if self.text:  # No `self.text' with current UTF-32 hack.
                self._classify_from_content(lidb)
            return self
        finally:
            # Free the memory used by the accessor.
            del self._accessor
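
    # Illustrative use of `quick_determine_lang` (this module's own file
    # is just a convenient example path):
    #     >>> ti = TextInfo.init_from_path(__file__, quick_determine_lang=True)
    #     >>> ti.lang
    #     'Python'
    # In this mode encoding-related fields may be left unset, because
    # processing stops as soon as the lang is known.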

    def __repr__(self):
        if self.path:
            return "<TextInfo %r>" % self.path
        else:
            return "<TextInfo %r>" \
                   % _one_line_summary_from_text(self.content, 30)

    def as_dict(self):
        return dict((k, v) for k, v in self.__dict__.items()
                    if not k.startswith('_'))

    def as_summary(self):
        """One-liner string summary of text info."""
        d = self.as_dict()
        info = []
        if self.file_type_name and self.file_type_name != "regular file":
            info.append(self.file_type_name)
        else:
            info.append(self.lang or "???")
            if not self.is_text:
                info.append("binary")
            elif self.encoding:
                enc = self.encoding
                if self.has_bom:
                    enc += " (bom)"
                info.append(enc)
            if DEBUG_CHARDET_INFO and hasattr(self, "chardet_info") \
               and self.chardet_info["encoding"]:
                info.append("chardet:%s/%.1f%%"
                            % (self.chardet_info["encoding"],
                               self.chardet_info["confidence"] * 100.0))
        return "%s: %s" % (self.path, ', '.join(info))

    def _classify_from_content(self, lidb):
        #TODO: Plan:
        # - eol_* attrs (test cases for this!)

        head = self.text[:self._accessor.HEAD_SIZE]
        tail = self.text[-self._accessor.TAIL_SIZE:]

        # If lang is unknown, attempt to guess from the XML prolog or
        # shebang now that we've successfully decoded the buffer.
        if self.langinfo is None:
            (self.has_xml_prolog, xml_version,
             xml_encoding) = self._get_xml_prolog_info(head)
            if self.has_xml_prolog:
                self.xml_version = xml_version
                self.xml_encoding = xml_encoding
                self.langinfo = lidb.langinfo_from_lang("XML")
                self.lang = self.langinfo.name
            elif self.text.startswith("#!"):
                li = lidb.langinfo_from_magic(self.text, shebang_only=True)
                if li:
                    self.langinfo = li
                    self.lang = li.name

        # Extract Emacs local vars and Vi(m) modeline info and, if the
        # lang is still unknown, attempt to use them to determine it.
        self.emacs_vars = self._get_emacs_head_vars(head)
        self.emacs_vars.update(self._get_emacs_tail_vars(tail))
        self.vi_vars = self._get_vi_vars(head)
        if not self.vi_vars:
            self.vi_vars = self._get_vi_vars(tail)
        if self.langinfo is None and "mode" in self.emacs_vars:
            li = lidb.langinfo_from_emacs_mode(self.emacs_vars["mode"])
            if li:
                self.langinfo = li
                self.lang = li.name
        if self.langinfo is None and ("filetype" in self.vi_vars
                                      or "ft" in self.vi_vars):
            vi_filetype = self.vi_vars.get("filetype") or self.vi_vars.get("ft")
            li = lidb.langinfo_from_vi_filetype(vi_filetype)
            if li:
                self.langinfo = li
                self.lang = li.name

        if self.langinfo is not None:
            if self.langinfo.conforms_to("XML"):
                if not hasattr(self, "has_xml_prolog"):
                    (self.has_xml_prolog, self.xml_version,
                     self.xml_encoding) = self._get_xml_prolog_info(head)
                (self.has_doctype_decl, self.doctype_decl,
                 self.doctype_name, self.doctype_public_id,
                 self.doctype_system_id) = self._get_doctype_decl_info(head)

                # If this is just plain XML, we try to use the doctype
                # decl to choose a more specific XML lang.
                if self.lang == "XML" and self.has_doctype_decl:
                    li = lidb.langinfo_from_doctype(
                            public_id=self.doctype_public_id,
                            system_id=self.doctype_system_id)
                    if li and li.name != "XML":
                        self.langinfo = li
                        self.lang = li.name

            elif self.langinfo.conforms_to("HTML"):
                (self.has_doctype_decl, self.doctype_decl,
                 self.doctype_name, self.doctype_public_id,
                 self.doctype_system_id) = self._get_doctype_decl_info(head)

                # Allow promotion to XHTML (or other HTML flavours) based
                # on the doctype.
                if self.lang == "HTML" and self.has_doctype_decl:
                    li = lidb.langinfo_from_doctype(
                            public_id=self.doctype_public_id,
                            system_id=self.doctype_system_id)
                    if li and li.name != "HTML":
                        self.langinfo = li
                        self.lang = li.name

                # Look for an XML prolog and promote HTML -> XHTML if it
                # exists. Note that this wins over a plain HTML doctype.
                (self.has_xml_prolog, xml_version,
                 xml_encoding) = self._get_xml_prolog_info(head)
                if self.has_xml_prolog:
                    self.xml_version = xml_version
                    self.xml_encoding = xml_encoding
                    if self.lang == "HTML":
                        li = lidb.langinfo_from_lang("XHTML")
                        self.langinfo = li
                        self.lang = li.name

        # Attempt to specialize the lang.
        if self.langinfo is not None:
            li = lidb.specialized_langinfo_from_content(self.langinfo, self.text)
            if li:
                self.langinfo = li
                self.lang = li.name

    def _classify_from_magic(self, lidb):
        """Attempt to classify from the file's magic number/shebang
        line, doctype, etc.

        Note that this is done before determining the encoding, so we are
        working with the *bytes*, not chars.
        """
        self.has_bom, bom, bom_encoding = self._get_bom_info()
        if self.has_bom:
            # If this file has a BOM then, unless something funny is
            # happening, this will be a text file encoded with
            # `bom_encoding`. We leave that to `_classify_encoding()`.
            return

        # Without a BOM we assume this is an 8-bit encoding, for the
        # purposes of looking at, e.g., a shebang line.
        #
        # UTF-16 and UTF-32 without a BOM is rare; we won't pick up on,
        # e.g., Python encoded as UCS-2 or UCS-4 here (but
        # `_classify_encoding()` should catch most of those cases).
        head_bytes = self._accessor.head_bytes
        li = lidb.langinfo_from_magic(head_bytes)
        if li:
            log.debug("lang from magic: %s", li.name)
            self.langinfo = li
            self.lang = li.name
            self.is_text = li.is_text
            return

        (has_doctype_decl, doctype_decl, doctype_name, doctype_public_id,
         doctype_system_id) = self._get_doctype_decl_info(head_bytes)
        if has_doctype_decl:
            li = lidb.langinfo_from_doctype(public_id=doctype_public_id,
                                            system_id=doctype_system_id)
            if li:
                log.debug("lang from doctype: %s", li.name)
                self.langinfo = li
                self.lang = li.name
                self.is_text = li.is_text
                return

    def _classify_encoding(self, lidb, suggested_encoding=None):
        """To classify from the content we need to separate text from
        binary, and figure out the encoding. This is an imperfect task.
        The algorithm here goes through the following heroics to attempt
        to determine an encoding that works to decode the content. If
        all such attempts fail, we presume the content is binary.

        1. Use the BOM, if the content has one.
        2. Try the given suggested encoding (if any).
        3. Check for EBCDIC encoding.
        4. Lang-specific (if we know the lang already):
           * if this is Python, look for a coding: decl and try that
           * if this is Perl, look for a use encoding decl and try that
           * ...
        5. XML: According to the XML spec the rule is that the XML prolog
           specifies the encoding, or it is UTF-8.
        6. HTML: Attempt to use the Content-Type meta tag. Try the given
           charset, if any.
        7. Emacs-style "coding" local var.
        8. Vi[m]-style "fileencoding" local var.
        9. Heuristic checks for UTF-16 without a BOM.
        10. Give UTF-8 a try; it is a pretty common fallback.
            We must do this before a possible 8-bit
            `locale.getpreferredencoding()` because any UTF-8 encoded
            document will also decode with an 8-bit encoding (i.e. it
            will decode, just with bogus characters).
        11. Lang-specific fallback. E.g., UTF-8 for XML, ascii for Python.
        12. chardet (http://chardet.feedparser.org/), if CHARDET_ENABLED == True.
        13. locale.getpreferredencoding()
        14. iso8859-1 (in case `locale.getpreferredencoding()` is UTF-8,
            we must have an 8-bit encoding attempt).

        TODO: Is there a worry about a lot of false positives for
        binary files?

        Notes:
        - A la Universal Feed Parser, if some
          supposed-to-be-authoritative encoding indicator is wrong (e.g.
          the BOM, the Python 'coding:' decl for Python),
          `self.encoding_bozo` is set True and a reason is appended to
          the `self.encoding_bozo_reasons` list.
        """
        # 1. Try the BOM.
        if self.has_bom is not False:  # Was set in `_classify_from_magic()`.
            self.has_bom, bom, bom_encoding = self._get_bom_info()
            if self.has_bom:
                self._accessor.strip_bom(bom)
                # Python doesn't currently include a UTF-32 codec. For now
                # we'll *presume* that a UTF-32 BOM is correct. The
                # limitation is that `self.text' will NOT get set
                # because we cannot decode it.
                if bom_encoding in ("utf-32-le", "utf-32-be") \
                   or self._accessor.decode(bom_encoding):
                    log.debug("encoding: encoding from BOM: %r", bom_encoding)
                    self.encoding = bom_encoding
                    return
                else:
                    log.debug("encoding: BOM encoding (%r) was *wrong*",
                              bom_encoding)
                    self._encoding_bozo(
                        u"BOM encoding (%s) could not decode %s"
                        % (bom_encoding, self._accessor))

        head_bytes = self._accessor.head_bytes
        if DEBUG_CHARDET_INFO:
            sys.path.insert(0, os.path.expanduser("~/tm/check/contrib/chardet"))
            import chardet
            del sys.path[0]
            self.chardet_info = chardet.detect(head_bytes)

        # 2. Try the suggested encoding.
        if suggested_encoding is not None:
            norm_suggested_encoding = _norm_encoding(suggested_encoding)
            if self._accessor.decode(suggested_encoding):
                self.encoding = norm_suggested_encoding
                return
            else:
                log.debug("encoding: suggested %r encoding didn't work for %s",
                          suggested_encoding, self._accessor)

        # 3. Check for EBCDIC.
        #TODO: Not sure this should be included, chardet may be better
        #      at this given different kinds of EBCDIC.
        EBCDIC_MAGIC = '\x4c\x6f\xa7\x94'
        if self._accessor.head_4_bytes == EBCDIC_MAGIC:
            # This is EBCDIC, but we don't know if there are multiple kinds
            # of EBCDIC. Python has an 'ebcdic-cp-us' codec. We'll use
            # that for now.
            norm_ebcdic_encoding = _norm_encoding("ebcdic-cp-us")
            if self._accessor.decode(norm_ebcdic_encoding):
                log.debug("EBCDIC encoding: %r", norm_ebcdic_encoding)
                self.encoding = norm_ebcdic_encoding
                return
            else:
                log.debug("EBCDIC encoding didn't work for %s",
                          self._accessor)

        # 4. Lang-specific (if we know the lang already).
        if self.langinfo and self.langinfo.conformant_attr("encoding_decl_pattern"):
            m = self.langinfo.conformant_attr("encoding_decl_pattern") \
                    .search(head_bytes)
            if m:
                lang_encoding = m.group("encoding")
                norm_lang_encoding = _norm_encoding(lang_encoding)
                if self._accessor.decode(norm_lang_encoding):
                    log.debug("encoding: encoding from lang-spec: %r",
                              norm_lang_encoding)
                    self.encoding = norm_lang_encoding
                    return
                else:
                    log.debug("encoding: lang-spec encoding (%r) was *wrong*",
                              lang_encoding)
                    self._encoding_bozo(
                        u"lang-spec encoding (%s) could not decode %s"
                        % (lang_encoding, self._accessor))

        # 5. XML prolog.
        if self.langinfo and self.langinfo.conforms_to("XML"):
            has_xml_prolog, xml_version, xml_encoding \
                = self._get_xml_prolog_info(head_bytes)
            if xml_encoding is not None:
                norm_xml_encoding = _norm_encoding(xml_encoding)
                if self._accessor.decode(norm_xml_encoding):
                    log.debug("encoding: encoding from XML prolog: %r",
                              norm_xml_encoding)
                    self.encoding = norm_xml_encoding
                    return
                else:
                    log.debug("encoding: XML prolog encoding (%r) was *wrong*",
                              norm_xml_encoding)
                    self._encoding_bozo(
                        u"XML prolog encoding (%s) could not decode %s"
                        % (norm_xml_encoding, self._accessor))

        # 6. HTML: Attempt to use the Content-Type meta tag.
        if self.langinfo and self.langinfo.conforms_to("HTML"):
            has_http_content_type_info, http_content_type, http_encoding \
                = self._get_http_content_type_info(head_bytes)
            if has_http_content_type_info and http_encoding:
                norm_http_encoding = _norm_encoding(http_encoding)
                if self._accessor.decode(norm_http_encoding):
                    log.debug("encoding: encoding from HTTP content-type: %r",
                              norm_http_encoding)
                    self.encoding = norm_http_encoding
                    return
                else:
                    log.debug("encoding: HTTP content-type encoding (%r) was *wrong*",
                              norm_http_encoding)
                    self._encoding_bozo(
                        u"HTML content-type encoding (%s) could not decode %s"
                        % (norm_http_encoding, self._accessor))

        # 7. Emacs-style local vars.
        emacs_head_vars = self._get_emacs_head_vars(head_bytes)
        emacs_encoding = emacs_head_vars.get("coding")
        if not emacs_encoding:
            tail_bytes = self._accessor.tail_bytes
            emacs_tail_vars = self._get_emacs_tail_vars(tail_bytes)
            emacs_encoding = emacs_tail_vars.get("coding")
        if emacs_encoding:
            norm_emacs_encoding = _norm_encoding(emacs_encoding)
            if self._accessor.decode(norm_emacs_encoding):
                log.debug("encoding: encoding from Emacs coding var: %r",
                          norm_emacs_encoding)
                self.encoding = norm_emacs_encoding
                return
            else:
                log.debug("encoding: Emacs coding var (%r) was *wrong*",
                          norm_emacs_encoding)
                self._encoding_bozo(
                    u"Emacs coding var (%s) could not decode %s"
                    % (norm_emacs_encoding, self._accessor))

        # 8. Vi[m]-style local vars.
        vi_vars = self._get_vi_vars(head_bytes)
        vi_encoding = vi_vars.get("fileencoding") or vi_vars.get("fenc")
        if not vi_encoding:
            vi_vars = self._get_vi_vars(self._accessor.tail_bytes)
            vi_encoding = vi_vars.get("fileencoding") or vi_vars.get("fenc")
        if vi_encoding:
            norm_vi_encoding = _norm_encoding(vi_encoding)
            if self._accessor.decode(norm_vi_encoding):
                log.debug("encoding: encoding from Vi[m] coding var: %r",
                          norm_vi_encoding)
                self.encoding = norm_vi_encoding
                return
            else:
                log.debug("encoding: Vi[m] coding var (%r) was *wrong*",
                          norm_vi_encoding)
                self._encoding_bozo(
                    u"Vi[m] coding var (%s) could not decode %s"
                    % (norm_vi_encoding, self._accessor))

        # 9. Heuristic checks for UTF-16 without a BOM.
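        # (E.g., "<?xml" encoded as UTF-16-LE begins
        # '<\x00?\x00x\x00m\x00l\x00': the ASCII chars occupy every
        # second byte starting at offset 0 -- the `head_odd_bytes`
        # slice below -- with NUL bytes interleaved between them.)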
        utf16_encoding = None
        head_odd_bytes = head_bytes[0::2]
        head_even_bytes = head_bytes[1::2]
        head_markers = ["<?xml", "#!"]
        for head_marker in head_markers:
            length = len(head_marker)
            if head_odd_bytes.startswith(head_marker) \
               and head_even_bytes[0:length] == '\x00'*length:
                utf16_encoding = "utf-16-le"
                break
            elif head_even_bytes.startswith(head_marker) \
                 and head_odd_bytes[0:length] == '\x00'*length:
                utf16_encoding = "utf-16-be"
                break
        internal_markers = ["coding"]
        for internal_marker in internal_markers:
            length = len(internal_marker)
            try:
                idx = head_odd_bytes.index(internal_marker)
            except ValueError:
                pass
            else:
                if head_even_bytes[idx:idx+length] == '\x00'*length:
                    utf16_encoding = "utf-16-le"
            try:
                idx = head_even_bytes.index(internal_marker)
            except ValueError:
                pass
            else:
                if head_odd_bytes[idx:idx+length] == '\x00'*length:
                    utf16_encoding = "utf-16-be"
        if utf16_encoding:
            if self._accessor.decode(utf16_encoding):
                log.debug("encoding: guessed encoding: %r", utf16_encoding)
                self.encoding = utf16_encoding
                return

        # 10. Give UTF-8 a try.
        norm_utf8_encoding = _norm_encoding("utf-8")
        if self._accessor.decode(norm_utf8_encoding):
            log.debug("UTF-8 encoding: %r", norm_utf8_encoding)
            self.encoding = norm_utf8_encoding
            return

        # 11. Lang-specific fallback (e.g. XML -> utf-8, Python -> ascii, ...).
        # Note: A potential problem here is that a fallback encoding that
        # is a pre-Unicode single-byte encoding (like iso8859-1) always
        # "works", so the subsequent heuristics never get tried.
        fallback_encoding = None
        fallback_lang = None
        if self.langinfo:
            fallback_lang = self.langinfo.name
            fallback_encoding = self.langinfo.conformant_attr("default_encoding")
        if fallback_encoding:
            if self._accessor.decode(fallback_encoding):
                log.debug("encoding: fallback encoding for %s: %r",
                          fallback_lang, fallback_encoding)
                self.encoding = fallback_encoding
                return
            else:
                log.debug("encoding: %s fallback encoding (%r) was *wrong*",
                          fallback_lang, fallback_encoding)
                self._encoding_bozo(
                    u"%s fallback encoding (%s) could not decode %s"
                    % (fallback_lang, fallback_encoding, self._accessor))

        # 12. chardet (http://chardet.feedparser.org/)
        # Note: I'm leery of using this b/c (a) it's a sizeable perf
        # hit and (b) false positives -- for example, the first 8kB of
        # /usr/bin/php on Mac OS X 10.4.10 is ISO-8859-2 with 44%
        # confidence. :)
        # Solution: (a) only allow it for content we know is not binary
        # (from the langinfo association); and (b) it can be disabled via
        # the CHARDET_ENABLED class attribute.
        if self.CHARDET_ENABLED and self.langinfo and self.langinfo.is_text:
            try:
                import chardet
            except ImportError:
                warnings.warn("no chardet module to aid in guessing encoding",
                              ChardetImportWarning)
            else:
                chardet_info = chardet.detect(head_bytes)
                if chardet_info["encoding"] \
                   and chardet_info["confidence"] > self.CHARDET_THRESHHOLD:
                    chardet_encoding = chardet_info["encoding"]
                    norm_chardet_encoding = _norm_encoding(chardet_encoding)
                    if self._accessor.decode(norm_chardet_encoding):
                        log.debug("chardet encoding: %r", chardet_encoding)
                        self.encoding = norm_chardet_encoding
                        return

        # 13. locale.getpreferredencoding()
        # Typical values for this:
        #   Windows:   cp1252 (aka windows-1252)
        #   Mac OS X:  mac-roman
        #   Linux:     UTF-8 (modern Linux anyway)
        #   Solaris 8: 464 (aka ASCII)
        locale_encoding = locale.getpreferredencoding()
        if locale_encoding:
            norm_locale_encoding = _norm_encoding(locale_encoding)
            if self._accessor.decode(norm_locale_encoding):
                log.debug("encoding: locale preferred encoding: %r",
                          locale_encoding)
                self.encoding = norm_locale_encoding
                return

        # 14. iso8859-1
        norm_fallback8bit_encoding = _norm_encoding("iso8859-1")
        if self._accessor.decode(norm_fallback8bit_encoding):
            log.debug("fallback 8-bit encoding: %r", norm_fallback8bit_encoding)
            self.encoding = norm_fallback8bit_encoding
            return

        # We couldn't find an encoding that works. Give up and presume
        # this is binary content.
        self.is_text = False

    def _encoding_bozo(self, reason):
        self.encoding_bozo = True
        if self.encoding_bozo_reasons is None:
            self.encoding_bozo_reasons = []
        self.encoding_bozo_reasons.append(reason)

    # c.f. http://www.xml.com/axml/target.html#NT-prolog
    _xml_prolog_pat = re.compile(
        r'''<\?xml
            (   # strict ordering is reqd but we'll be liberal here
                \s+version=['"](?P<ver>.*?)['"]
              | \s+encoding=['"](?P<enc>.*?)['"]
            )+
            .*?  # other possible junk
            \s*\?>
        ''',
        re.VERBOSE | re.DOTALL
    )

    def _get_xml_prolog_info(self, head_bytes):
        """Parse out info from the '<?xml version=...' prolog, if any.

        Returns (<has-xml-prolog>, <xml-version>, <xml-encoding>). Examples:

            (False, None, None)
            (True, "1.0", None)
            (True, "1.0", "UTF-16")
        """
        # Presuming an 8-bit encoding. If it is UTF-16 or UTF-32, then
        # that should have been picked up by an earlier BOM check or via
        # the subsequent heuristic check for UTF-16 without a BOM.
        if not head_bytes.startswith("<?xml"):
            return (False, None, None)

        # Try to extract more info from the prolog.
        match = self._xml_prolog_pat.match(head_bytes)
        if not match:
            if log.isEnabledFor(logging.DEBUG):
                log.debug("`%s': could not match XML prolog: '%s'", self.path,
                          _one_line_summary_from_text(head_bytes, 40))
            return (False, None, None)
        xml_version = match.group("ver")
        xml_encoding = match.group("enc")
        return (True, xml_version, xml_encoding)

    _html_meta_tag_pat = re.compile("""
        (<meta
         (?:\s+[\w-]+\s*=\s*(?:".*?"|'.*?'))+  # attributes
         \s*/?>)
        """,
        re.IGNORECASE | re.VERBOSE
    )
    _html_attr_pat = re.compile(
        # Currently requiring XML attrs (i.e. quoted value).
        '''(?:\s+([\w-]+)\s*=\s*(".*?"|'.*?'))'''
    )
    _http_content_type_splitter = re.compile(";\s*")

    def _get_http_content_type_info(self, head_bytes):
        """Returns info extracted from an HTML content-type meta tag, if any.

        Returns (<has-http-content-type-info>, <content-type>, <charset>).
        For example:
            <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
        yields:
            (True, "text/html", "utf-8")
        """
        # Presuming an 8-bit encoding. If it is UTF-16 or UTF-32, then
        # that should have been picked up by an earlier BOM check.
        # Otherwise we rely on `chardet` to cover us.

        # Parse out '<meta ...>' tags, then the attributes in them.
        for meta_tag in self._html_meta_tag_pat.findall(head_bytes):
            meta = dict((k.lower(), v[1:-1])
                        for k, v in self._html_attr_pat.findall(meta_tag))
            if "http-equiv" in meta \
               and meta["http-equiv"].lower() == "content-type":
                content = meta.get("content", "")
                break
        else:
            return (False, None, None)

        # We found an http-equiv="Content-Type" tag; parse its content
        # attribute value.
        parts = [p.strip() for p in self._http_content_type_splitter.split(content)]
        if not parts:
            return (False, None, None)
        content_type = parts[0] or None
        for p in parts[1:]:
            if p.lower().startswith("charset="):
                charset = p[len("charset="):]
                if charset and charset[0] in ('"', "'"):
                    charset = charset[1:]
                if charset and charset[-1] in ('"', "'"):
                    charset = charset[:-1]
                break
        else:
            charset = None
        return (True, content_type, charset)

    #TODO: Note that this isn't going to catch the current HTML 5
    #      doctype: '<!DOCTYPE html>'
    _doctype_decl_re = re.compile(r'''
        <!DOCTYPE
        \s+(?P<name>[a-zA-Z_:][\w:.-]*)
        \s+(?:
            SYSTEM\s+(["'])(?P<system_id_a>.*?)\2
            |
            PUBLIC
            \s+(["'])(?P<public_id_b>.*?)\4
            # HTML 3.2 and 2.0 doctypes don't include a system-id.
            (?:\s+(["'])(?P<system_id_b>.*?)\6)?
        )
        (\s*\[.*?\])?
        \s*>
        ''', re.IGNORECASE | re.DOTALL | re.UNICODE | re.VERBOSE)

    def _get_doctype_decl_info(self, head):
        """Parse out DOCTYPE info from the given XML or HTML content.

        Returns a tuple of the form:
            (<has-doctype-decl>, <doctype-decl>,
             <name>, <public-id>, <system-id>)

        The <public-id> is normalized as per this comment in the XML 1.0
        spec:
            Before a match is attempted, all strings of white space in the
            public identifier must be normalized to single space
            characters (#x20), and leading and trailing white space must
            be removed.

        Examples:
            (False, None, None, None, None)
            (True, '<!DOCTYPE greeting SYSTEM "hello.dtd">',
             'greeting', None, 'hello.dtd')
            (True,
             '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
             'html',
             '-//W3C//DTD XHTML 1.0 Transitional//EN',
             'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')

        Here is the spec for DOCTYPE decls in XML:
            http://www.xml.com/axml/target.html#NT-doctypedecl
        We loosely follow this to allow for some decls in HTML that aren't
        proper XML. As well, we are only parsing out decls that reference
        an external ID, as opposed to those that define entities locally.
        """
        if "<!DOCTYPE" not in head:  # quick out
            return (False, None, None, None, None)
        m = self._doctype_decl_re.search(head)
        if not m:
            return (False, None, None, None, None)
        d = m.groupdict()
        name = d.get("name")
        system_id = d.get("system_id_a") or d.get("system_id_b")
        public_id = d.get("public_id_b")
        if public_id:
            public_id = re.sub("\s+", ' ', public_id.strip())  # normalize
        return (True, m.group(0), name, public_id, system_id)

    _emacs_vars_head_pat = re.compile("-\*-\s*(.*?)\s*-\*-")

    _emacs_head_vars_cache = None
    def _get_emacs_head_vars(self, head_bytes):
        """Return a dictionary of emacs-style local variables in the head.

        "Head" emacs vars are the ones in the '-*- ... -*-' one-liner.

        Parsing is done loosely according to this spec (and according to
        some in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
        """
        # Presuming an 8-bit encoding. If it is UTF-16 or UTF-32, then
        # that should have been picked up by an earlier BOM check.
        # Otherwise we rely on `chardet` to cover us.
        if self._emacs_head_vars_cache is not None:
            return self._emacs_head_vars_cache

        # Search the head for a '-*-'-style one-liner of variables.
        emacs_vars = {}
        if "-*-" in head_bytes:
            match = self._emacs_vars_head_pat.search(head_bytes)
            if match:
                emacs_vars_str = match.group(1)
                if '\n' in emacs_vars_str:
                    raise ValueError("local variables error: -*- not "
                                     "terminated before end of line")
                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
                                  if s.strip()]
                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
                    # While not in the spec, this form is allowed by emacs:
                    #   -*- Tcl -*-
                    # where the implied "variable" is "mode". This form
                    # is only allowed if there are no other variables.
                    emacs_vars["mode"] = emacs_var_strs[0].strip()
                else:
                    for emacs_var_str in emacs_var_strs:
                        try:
                            variable, value = emacs_var_str.strip().split(':', 1)
                        except ValueError:
                            log.debug("emacs variables error: malformed -*- "
                                      "line: %r", emacs_var_str)
                            continue
                        # Lowercase the variable name because Emacs allows
                        # "Mode" or "mode" or "MoDe", etc.
                        emacs_vars[variable.lower()] = value.strip()

        # Unquote values.
        for var, val in emacs_vars.items():
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
                                 or val.startswith("'") and val.endswith("'")):
                emacs_vars[var] = val[1:-1]

        self._emacs_head_vars_cache = emacs_vars
        return emacs_vars
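
    # For example (illustrative): a head line such as
    #     # -*- mode: python; coding: utf-8 -*-
    # yields {'coding': 'utf-8', 'mode': 'python'}.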

    # This regular expression is intended to match blocks like this:
    #     PREFIX Local Variables: SUFFIX
    #     PREFIX mode: Tcl SUFFIX
    #     PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    _emacs_vars_tail_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

    _emacs_tail_vars_cache = None
    def _get_emacs_tail_vars(self, tail_bytes):
        r"""Return a dictionary of emacs-style local variables in the tail.

        "Tail" emacs vars are the ones in the multi-line "Local
        Variables:" block.

        >>> TextInfo()._get_emacs_tail_vars('# Local Variables:\n# foo: bar\n# End:')
        {'foo': 'bar'}
        >>> TextInfo()._get_emacs_tail_vars('# Local Variables:\n# foo: bar\\\n# baz\n# End:')
        {'foo': 'bar baz'}
        >>> TextInfo()._get_emacs_tail_vars('# Local Variables:\n# quoted: "bar "\n# End:')
        {'quoted': 'bar '}

        Parsing is done according to this spec (and according to some
        in-practice deviations from this):
        http://www.gnu.org/software/emacs/manual/html_chapter/emacs_33.html#SEC485
        """
        # Presuming an 8-bit encoding. If it is UTF-16 or UTF-32, then
        # that should have been picked up by an earlier BOM check.
        # Otherwise we rely on `chardet` to cover us.
        if self._emacs_tail_vars_cache is not None:
            return self._emacs_tail_vars_cache

        emacs_vars = {}
        if "Local Variables" not in tail_bytes:
            self._emacs_tail_vars_cache = emacs_vars
            return emacs_vars

        match = self._emacs_vars_tail_pat.search(tail_bytes)
        if match:
            prefix = match.group("prefix")
            suffix = match.group("suffix")
            lines = match.group("content").splitlines(0)
            #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
            #      % (prefix, suffix, match.group("content"), lines)

            # Validate the Local Variables block: proper prefix and suffix
            # usage.
            for i, line in enumerate(lines):
                if not line.startswith(prefix):
                    log.debug("emacs variables error: line '%s' "
                              "does not use proper prefix '%s'"
                              % (line, prefix))
                    return {}
                # Don't validate suffix on last line. Emacs doesn't care,
                # neither should we.
                if i != len(lines)-1 and not line.endswith(suffix):
                    log.debug("emacs variables error: line '%s' "
                              "does not use proper suffix '%s'"
                              % (line, suffix))
                    return {}

            # Parse out one emacs var per line.
            continued_for = None
            for line in lines[:-1]:  # no var on the last line ("PREFIX End:")
                if prefix: line = line[len(prefix):]  # strip prefix
                if suffix: line = line[:-len(suffix)]  # strip suffix
                line = line.strip()
                if continued_for:
                    variable = continued_for
                    if line.endswith('\\'):
                        line = line[:-1].rstrip()
                    else:
                        continued_for = None
                    emacs_vars[variable] += ' ' + line
                else:
                    try:
                        variable, value = line.split(':', 1)
                    except ValueError:
                        log.debug("local variables error: missing colon "
                                  "in local variables entry: '%s'" % line)
                        continue
                    # Do NOT lowercase the variable name, because Emacs only
                    # allows "mode" (and not "Mode", "MoDe", etc.) in this
                    # block.
                    value = value.strip()
                    if value.endswith('\\'):
                        value = value[:-1].rstrip()
                        continued_for = variable
                    else:
                        continued_for = None
                    emacs_vars[variable] = value

        # Unquote values.
        for var, val in emacs_vars.items():
            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
                                 or val.startswith("'") and val.endswith("'")):
                emacs_vars[var] = val[1:-1]

        self._emacs_tail_vars_cache = emacs_vars
        return emacs_vars

    # Note: It might be nice if the parser also gave which of 'vi, vim, ex'
    # was used, and the range in the accessor.
    _vi_vars_pats_and_splitters = [
        (re.compile(r'[ \t]+(vi|vim([<>=]?\d{3})?|ex):\s*set? (?P<rhs>.*?)(?<!\\):', re.M),
         re.compile(r'[ \t]+')),
        (re.compile(r'[ \t]+(vi|vim([<>=]?\d{3})?|ex):\s*(?P<rhs>.*?)$', re.M),
         re.compile(r'[ \t:]+')),
        (re.compile(r'^(vi|vim([<>=]?\d{3})?):\s*set? (?P<rhs>.*?)(?<!\\):', re.M),
         re.compile(r'[ \t]+')),
    ]
    _vi_vars_cache = None
    def _get_vi_vars(self, bytes):
        r"""Return a dict of Vi[m] modeline vars.

        See ":help modeline" in Vim for a spec.

        >>> TextInfo()._get_vi_vars("/* vim: set ai tw=75: */")
        {'ai': None, 'tw': 75}
        >>> TextInfo()._get_vi_vars("vim: set ai tw=75: bar")
        {'ai': None, 'tw': 75}
        >>> TextInfo()._get_vi_vars("vi: set foo:bar")
        {'foo': None}
        >>> TextInfo()._get_vi_vars(" vi: se foo:bar")
        {'foo': None}
        >>> TextInfo()._get_vi_vars(" ex: se foo:bar")
        {'foo': None}
        >>> TextInfo()._get_vi_vars(" vi:noai:sw=3 tw=75")
        {'tw': 75, 'sw': 3, 'noai': None}
        >>> TextInfo()._get_vi_vars(" vi:noai:sw=3 tw=75")
        {'tw': 75, 'sw': 3, 'noai': None}
        >>> TextInfo()._get_vi_vars("ex: se foo:bar")
        {}

        Some edge cases:

        >>> TextInfo()._get_vi_vars(r"/* vi:set dir=c\:\tmp: */")
        {'dir': 'c:\\tmp'}
        """
        # Presume 8-bit encoding... yada yada.
        if self._vi_vars_cache is not None:
            return self._vi_vars_cache

        vi_vars = {}

        #TODO: Consider reducing support to just "vi:" for speed. This
        #      function takes way too much time.
        if "vi:" not in bytes and "ex:" not in bytes and "vim:" not in bytes:
            self._vi_vars_cache = vi_vars
            return vi_vars

        for pat, splitter in self._vi_vars_pats_and_splitters:
            match = pat.search(bytes)
            if match:
                for var_str in splitter.split(match.group("rhs")):
                    if '=' in var_str:
                        name, value = var_str.split('=', 1)
                        try:
                            vi_vars[name] = int(value)
                        except ValueError:
                            vi_vars[name] = value.replace('\\:', ':')
                    else:
                        vi_vars[var_str] = None
                break

        self._vi_vars_cache = vi_vars
        return vi_vars

    def _get_bom_info(self):
        r"""Returns (<has-bom>, <bom>, <bom-encoding>). Examples:

            (True, '\xef\xbb\xbf', "utf-8")
            (True, '\xff\xfe', "utf-16-le")
            (False, None, None)
        """
        boms_and_encodings = [  # in order from longest to shortest
            (codecs.BOM_UTF32_LE, "utf-32-le"),
            (codecs.BOM_UTF32_BE, "utf-32-be"),
            (codecs.BOM_UTF8, "utf-8"),
            (codecs.BOM_UTF16_LE, "utf-16-le"),
            (codecs.BOM_UTF16_BE, "utf-16-be"),
        ]
        head_4 = self._accessor.head_4_bytes
        for bom, encoding in boms_and_encodings:
            if head_4.startswith(bom):
                return (True, bom, encoding)
        return (False, None, None)

    def _classify_from_filename(self, lidb, env=None):
        """Classify from the path *filename* only.

        Sets `lang' and `langinfo', if they can be determined.
        """
        filename = basename(self.path)
        if env is not None:
            li = env.langinfo_from_filename(filename)
            if li:
                log.debug("lang from env: `%s' -> `%s'", filename, li.name)
                self.langinfo = li
                self.lang = li.name
                self.is_text = li.is_text
                return

        # ...from the ext
        idx = 0
        while True:
            idx = filename.find('.', idx)
            if idx == -1:
                break
            ext = filename[idx:]
            li = lidb.langinfo_from_ext(ext)
            if li:
                log.debug("lang from ext: `%s' -> `%s'", ext, li.name)
                self.langinfo = li
                self.lang = li.name
                self.is_text = li.is_text
                return
            idx += 1

        # ...from the file basename
        li = lidb.langinfo_from_filename(filename)
        if li:
            log.debug("lang from filename: `%s' -> `%s'", filename, li.name)
            self.langinfo = li
            self.lang = li.name
            self.is_text = li.is_text
            return

    def _classify_from_stat(self, lidb):
        """Set some `file_*' attributes from the stat mode."""
        from stat import S_ISREG, S_ISDIR, S_ISLNK, S_ISFIFO, S_ISSOCK, \
                         S_ISBLK, S_ISCHR, S_IMODE, S_IFMT
        stat = self._accessor.stat
        st_mode = stat.st_mode
        self.file_type = S_IFMT(st_mode)
        self.file_mode = S_IMODE(st_mode)
        self.file_stat = stat
        if S_ISREG(st_mode):
            self.file_type_name = "regular file"
        elif S_ISDIR(st_mode):
            self.file_type_name = "directory"
        elif S_ISLNK(st_mode):
            self.file_type_name = "symbolic link"
        elif S_ISFIFO(st_mode):
            self.file_type_name = "fifo"
        elif S_ISSOCK(st_mode):
            self.file_type_name = "socket"
        elif S_ISBLK(st_mode):
            self.file_type_name = "block special"
        elif S_ISCHR(st_mode):
            self.file_type_name = "character special"
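
# A minimal sketch of the `env` hook accepted by
# `TextInfo.init_from_path()`: any object with a
# `langinfo_from_filename()` method will do. The ".kpf" mapping below
# is a made-up example.
class _ExampleEnv(object):
    """Hypothetical runtime environment: force a lang for one pattern."""
    def langinfo_from_filename(self, filename):
        if filename.endswith(".kpf"):  # hypothetical mapping
            return langinfo.get_default_database().langinfo_from_lang("XML")
        return None  # defer to the normal classification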

def _norm_encoding(encoding):
    """Normalize the encoding name -- where "normalized" is what
    Python's codecs module calls it.

    Interesting link:
        The IANA-registered set of character sets.
        http://www.iana.org/assignments/character-sets
    """
    try:
        # This requires Python >= 2.5.
        return codecs.lookup(encoding).name
    except LookupError:
        return encoding
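
# For example (illustrative; results come from Python's codecs registry):
#     _norm_encoding("UTF8")     -> 'utf-8'
#     _norm_encoding("latin-1")  -> 'iso8859-1'
#     _norm_encoding("no-such")  -> 'no-such' (returned unchanged)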

#---- accessor API
# The idea here is to abstract accessing the text file content being
# classified to allow, e.g., classifying content without a file, from
# a Komodo buffer, etc.

class Accessor(object):
    """Virtual base class defining the Accessor API for accessing
    text content.
    """
    # API:
    #   prop head_bytes   -> head 8k bytes
    #   prop head_4_bytes -> head 4 bytes (useful for BOM detection)
    #   prop tail_bytes   -> tail 8k bytes
    #   def bytes_range(start, end) -> bytes in that range

    HEAD_SIZE = pow(2, 13)  # 8k
    TAIL_SIZE = pow(2, 13)  # 8k

    encoding = None
    text = None

    _unsuccessful_encodings = None
    def decode(self, encoding):
        """Decodes bytes with the given encoding and, if successful,
        sets `self.text` with the decoded result and returns True.
        Otherwise, returns False.

        Side-effects: On success, sets `self.text` and `self.encoding`.

        Optimization: First an attempt is made to decode
        `self.head_bytes` instead of all of `self.bytes`. This allows
        the normal usage in `TextInfo._classify_encoding()` to *not*
        bother fully reading binary files that could not be decoded.

        Optimization: Decoding attempts are cached so that a failed
        decode is not attempted twice.
        """
        if self._unsuccessful_encodings is None:
            self._unsuccessful_encodings = set()
        if encoding in self._unsuccessful_encodings:
            return False
        elif encoding == self.encoding:
            return True

        head_bytes = self.head_bytes
        try:
            head_bytes.decode(encoding, 'strict')
        except LookupError, ex:
            log.debug("encoding lookup error: %r", encoding)
            self._unsuccessful_encodings.add(encoding)
            return False
        except UnicodeError, ex:
            # If the decode failed in the last few bytes, it might be
            # because a multi-byte sequence was cut off by the head.
            # Ignore the error here: if the content truly is not of this
            # encoding, the full-file decode below will fail.
            if ex.start >= self.HEAD_SIZE - 5:
                # '5' because the max number of bytes used to encode a
                # single char in any supported encoding is 6 (in UTF-8).
                pass
            else:
                self._unsuccessful_encodings.add(encoding)
                return False
        try:
            self.text = self.bytes.decode(encoding, 'strict')
        except UnicodeError, ex:
            self._unsuccessful_encodings.add(encoding)
            return False
        self.encoding = encoding
        return True
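
# A minimal in-memory accessor sketch, per the comment above about
# classifying content that isn't in a file. Illustrative only: e.g., it
# does not implement the `strip_bom()` method that `PathAccessor` below
# provides.
class _StringAccessor(Accessor):
    """Accessor over a byte string that is already in memory."""
    def __init__(self, bytes):
        self._all_bytes = bytes
    @property
    def head_bytes(self):
        return self._all_bytes[:self.HEAD_SIZE]
    @property
    def head_4_bytes(self):
        return self._all_bytes[:4]
    @property
    def tail_bytes(self):
        return self._all_bytes[-self.TAIL_SIZE:]
    @property
    def bytes(self):
        return self._all_bytes
    def bytes_range(self, start, end):
        return self._all_bytes[start:end]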

class PathAccessor(Accessor):
    """Accessor API for a path."""
    (READ_NONE,              # _file==None, file not opened yet
     READ_HEAD,              # _bytes==<head bytes>
     READ_TAIL,              # _bytes==<head>, _bytes_tail==<tail>
     READ_ALL) = range(4)    # _bytes==<all>, _bytes_tail==None, _file closed
    _read_state = READ_NONE  # one of the READ_* states
    _file = None
    _bytes = None
    _bytes_tail = None

    def __init__(self, path, follow_symlinks=False):
        self.path = path
        self.follow_symlinks = follow_symlinks

    def __str__(self):
        return "path `%s'" % self.path

    _stat_cache = None
    @property
    def stat(self):
        if self._stat_cache is None:
            if self.follow_symlinks:
                self._stat_cache = os.stat(self.path)
            else:
                self._stat_cache = os.lstat(self.path)
        return self._stat_cache

    @property
    def size(self):
        return self.stat.st_size

    def __del__(self):
        self.close()

    def close(self):
        if self._file and not self._file.closed:
            self._file.close()

    def _read(self, state):
        """Read up to at least `state`."""
        #TODO: If `follow_symlinks` is False and this is a symlink we
        #      must use os.readlink() here.

        # It is the job of the caller to only call _read() if necessary.
        assert self._read_state < state
        try:
            if self._read_state == self.READ_NONE:
                assert self._file is None and self._bytes is None
                self._file = open(self.path, 'rb')
                if state == self.READ_HEAD:
                    self._bytes = self._file.read(self.HEAD_SIZE)
                    self._read_state = (self.size <= self.HEAD_SIZE
                                        and self.READ_ALL or self.READ_HEAD)
                elif state == self.READ_TAIL:
                    if self.size <= self.HEAD_SIZE + self.TAIL_SIZE:
                        self._bytes = self._file.read()
                        self._read_state = self.READ_ALL
                    else:
                        self._bytes = self._file.read(self.HEAD_SIZE)
                        self._file.seek(-self.TAIL_SIZE, 2)  # 2 == relative to end
                        self._bytes_tail = self._file.read(self.TAIL_SIZE)
                        self._read_state = self.READ_TAIL
                elif state == self.READ_ALL:
                    self._bytes = self._file.read()
                    self._read_state = self.READ_ALL
            elif self._read_state == self.READ_HEAD:
                if state == self.READ_TAIL:
                    if self.size <= self.HEAD_SIZE + self.TAIL_SIZE:
                        self._bytes += self._file.read()
                        self._read_state = self.READ_ALL
                    else:
                        self._file.seek(-self.TAIL_SIZE, 2)  # 2 == relative to end
                        self._bytes_tail = self._file.read(self.TAIL_SIZE)
                        self._read_state = self.READ_TAIL
                elif state == self.READ_ALL:
                    self._bytes += self._file.read()
                    self._read_state = self.READ_ALL
            elif self._read_state == self.READ_TAIL:
                assert state == self.READ_ALL
                self._file.seek(self.HEAD_SIZE, 0)  # 0 == relative to start
                remaining_size = self.size - self.HEAD_SIZE - self.TAIL_SIZE
                assert remaining_size > 0, \
                    "negative remaining bytes to read from '%s': %d" \
                    % (self.path, self.size)
                self._bytes += self._file.read(remaining_size)
                self._bytes += self._bytes_tail
                self._bytes_tail = None
                self._read_state = self.READ_ALL
            if self._read_state == self.READ_ALL:
                self.close()
        except Exception, ex:
            log.warn("Could not read file: %r due to: %r", self.path, ex)
            raise

    def strip_bom(self, bom):
        """This should be called by the user of this class to strip a
        detected BOM from the bytes for subsequent decoding and
        analysis.
        """
        assert self._bytes[:len(bom)] == bom
        self._bytes = self._bytes[len(bom):]

    @property
    def head_bytes(self):
        """The first 8k raw bytes of the document."""
        if self._read_state < self.READ_HEAD:
            self._read(self.READ_HEAD)
        return self._bytes[:self.HEAD_SIZE]

    @property
    def head_4_bytes(self):
        if self._read_state < self.READ_HEAD:
            self._read(self.READ_HEAD)
        return self._bytes[:4]

    @property
    def tail_bytes(self):
        if self._read_state < self.READ_TAIL:
            self._read(self.READ_TAIL)
        if self._read_state == self.READ_ALL:
            return self._bytes[-self.TAIL_SIZE:]
        else:
            return self._bytes_tail

    def bytes_range(self, start, end):
        if self._read_state < self.READ_ALL:
            self._read(self.READ_ALL)
        return self._bytes[start:end]

    @property
    def bytes(self):
        if self._read_state < self.READ_ALL:
            self._read(self.READ_ALL)
        return self._bytes
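
# Illustrative read-state progression (the path is hypothetical); the
# accessor reads lazily, touching at most HEAD_SIZE + TAIL_SIZE bytes
# until the full content is requested:
#     acc = PathAccessor("/etc/hosts")
#     acc.head_4_bytes   # READ_HEAD: reads the first 8k only
#     acc.tail_bytes     # READ_TAIL: seeks to read the last 8k
#     acc.bytes          # READ_ALL: whole file; the file is then closed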

#---- internal support stuff

# Recipe: regex_from_encoded_pattern (1.0)
def _regex_from_encoded_pattern(s):
    """'foo'    -> re.compile(re.escape('foo'))
       '/foo/'  -> re.compile('foo')
       '/foo/i' -> re.compile('foo', re.I)
    """
    if s.startswith('/') and s.rfind('/') != 0:
        # Parse it: /PATTERN/FLAGS
        idx = s.rfind('/')
        pattern, flags_str = s[1:idx], s[idx+1:]
        flag_from_char = {
            "i": re.IGNORECASE,
            "l": re.LOCALE,
            "s": re.DOTALL,
            "m": re.MULTILINE,
            "u": re.UNICODE,
        }
        flags = 0
        for char in flags_str:
            try:
                flags |= flag_from_char[char]
            except KeyError:
                raise ValueError("unsupported regex flag: '%s' in '%s' "
                                 "(must be one of '%s')"
                                 % (char, s, ''.join(flag_from_char.keys())))
        return re.compile(s[1:idx], flags)
    else:  # not an encoded regex
        return re.compile(re.escape(s))
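
# For example (illustrative):
#     _regex_from_encoded_pattern("/fo+/i").match("FOO")  # case-insensitive
#     _regex_from_encoded_pattern("fo+").match("fo+")     # literal, escaped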


# Recipe: text_escape (0.2)
def _escaped_text_from_text(text, escapes="eol"):
    r"""Return escaped version of text.

        "escapes" is either a mapping of chars in the source text to
            replacement text for each such char or one of a set of
            strings identifying a particular escape style:
                eol
                    replace EOL chars with '\r' and '\n', maintain the actual
                    EOLs though too
                whitespace
                    replace EOL chars as above, tabs with '\t' and spaces
                    with periods ('.')
                eol-one-line
                    replace EOL chars with '\r' and '\n'
                whitespace-one-line
                    replace EOL chars as above, tabs with '\t' and spaces
                    with periods ('.')
    """
    #TODO:
    # - Add 'c-string' style.
    # - Add _escaped_html_from_text() with a similar call sig.
    import re
    if isinstance(escapes, basestring):
        if escapes == "eol":
            escapes = {'\r\n': "\\r\\n\r\n", '\n': "\\n\n", '\r': "\\r\r"}
        elif escapes == "whitespace":
            escapes = {'\r\n': "\\r\\n\r\n", '\n': "\\n\n", '\r': "\\r\r",
                       '\t': "\\t", ' ': "."}
        elif escapes == "eol-one-line":
            escapes = {'\n': "\\n", '\r': "\\r"}
        elif escapes == "whitespace-one-line":
            escapes = {'\n': "\\n", '\r': "\\r", '\t': "\\t", ' ': '.'}
        else:
            raise ValueError("unknown text escape style: %r" % escapes)

    # Sort longer replacements first to allow, e.g. '\r\n' to beat '\r' and
    # '\n'.
    escapes_keys = escapes.keys()
    try:
        escapes_keys.sort(key=lambda a: len(a), reverse=True)
    except TypeError:
        # Python 2.3 support: sort() takes no keyword arguments
        escapes_keys.sort(lambda a, b: cmp(len(a), len(b)))
        escapes_keys.reverse()

    def repl(match):
        val = escapes[match.group(0)]
        return val
    escaped = re.sub("(%s)" % '|'.join([re.escape(k) for k in escapes_keys]),
                     repl,
                     text)
    return escaped
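
# Illustrative call: with the "whitespace-one-line" style, EOLs and tabs
# become their backslash-escaped forms and spaces become periods:
#   _escaped_text_from_text("a b\tc\n", "whitespace-one-line")
#   # -> 'a.b\\tc\\n' (a one-line, 8-character string)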


def _one_line_summary_from_text(text, length=78,
        escapes={'\n': "\\n", '\r': "\\r", '\t': "\\t"}):
    r"""Summarize the given text with one line of the given length.

        "text" is the text to summarize
        "length" (default 78) is the max length for the summary
        "escapes" is a mapping of chars in the source text to
            replacement text for each such char. By default '\r', '\n'
            and '\t' are escaped with their '\'-escaped repr.
    """
    if len(text) > length:
        head = text[:length-3]
    else:
        head = text
    escaped = _escaped_text_from_text(head, escapes)
    if len(text) > length:
        summary = escaped[:length-3] + "..."
    else:
        summary = escaped
    return summary
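
# Illustrative call: a 100-char text is cut to the default width of 78,
# ellipsis included:
#   _one_line_summary_from_text("x" * 100)
#   # -> 75 x's followed by "..." (78 chars total)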


# Recipe: paths_from_path_patterns (0.5)
def _should_include_path(path, includes, excludes):
    """Return True iff the given path should be included."""
    from os.path import basename
    from fnmatch import fnmatch

    base = basename(path)
    if includes:
        for include in includes:
            if fnmatch(base, include):
                try:
                    log.debug("include `%s' (matches `%s')", path, include)
                except (NameError, AttributeError):
                    pass
                break
        else:
            try:
                log.debug("exclude `%s' (matches no includes)", path)
            except (NameError, AttributeError):
                pass
            return False
    for exclude in excludes:
        if fnmatch(base, exclude):
            try:
                log.debug("exclude `%s' (matches `%s')", path, exclude)
            except (NameError, AttributeError):
                pass
            return False
    return True
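
# Illustrative calls: matching is on the basename only, with fnmatch-style
# globs:
#   _should_include_path("src/foo.py", ["*.py"], [])   # -> True
#   _should_include_path("src/foo.pyc", ["*.py"], [])  # -> False (no include)
#   _should_include_path("src/.svn", [], [".svn"])     # -> False (excluded)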


def _walk(top, topdown=True, onerror=None, follow_symlinks=False):
    """A version of `os.walk()` with a couple differences regarding symlinks.

    1. follow_symlinks=False (the default): A symlink to a dir is
       returned as a *non*-dir. In `os.walk()`, a symlink to a dir is
       returned in the *dirs* list, but it is not recursed into.
    2. follow_symlinks=True: A symlink to a dir is returned in the
       *dirs* list (as with `os.walk()`) but it *is conditionally*
       recursed into (unlike `os.walk()`).

       A symlinked dir is only recursed into if it is to a deeper dir
       within the same tree. This is my understanding of how `find -L
       DIR` works.

    TODO: put as a separate recipe
    """
    from os.path import join, isdir, islink, abspath

    # We may not have read permission for top, in which case we can't
    # get a list of the files the directory contains. os.path.walk
    # always suppressed the exception then, rather than blow up for a
    # minor reason when (say) a thousand readable directories are still
    # left to visit. That logic is copied here.
    try:
        names = os.listdir(top)
    except OSError, err:
        if onerror is not None:
            onerror(err)
        return

    dirs, nondirs = [], []
    if follow_symlinks:
        for name in names:
            if isdir(join(top, name)):
                dirs.append(name)
            else:
                nondirs.append(name)
    else:
        for name in names:
            path = join(top, name)
            if islink(path):
                nondirs.append(name)
            elif isdir(path):
                dirs.append(name)
            else:
                nondirs.append(name)

    if topdown:
        yield top, dirs, nondirs
    for name in dirs:
        path = join(top, name)
        if follow_symlinks and islink(path):
            # Only walk this path if it links deeper in the same tree.
            top_abs = abspath(top)
            link_abs = abspath(join(top, os.readlink(path)))
            if not link_abs.startswith(top_abs + os.sep):
                continue
        for x in _walk(path, topdown, onerror,
                       follow_symlinks=follow_symlinks):
            yield x
    if not topdown:
        yield top, dirs, nondirs
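
# Illustrative use: with the default follow_symlinks=False, a symlink to a
# dir lands in the third (non-dirs) list and is never recursed into:
#   for dirpath, dirnames, filenames in _walk("."):
#       print dirpath, dirnames, filenames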

_NOT_SPECIFIED = ("NOT", "SPECIFIED")

def _paths_from_path_patterns(path_patterns, files=True, dirs="never",
                              recursive=True, includes=[], excludes=[],
                              skip_dupe_dirs=False,
                              follow_symlinks=False,
                              on_error=_NOT_SPECIFIED):
    """_paths_from_path_patterns([<path-patterns>, ...]) -> file paths

    Generate a list of paths (files and/or dirs) represented by the given
    path patterns.

        "path_patterns" is a list of paths optionally using the '*', '?' and
            '[seq]' glob patterns.
        "files" is boolean (default True) indicating if file paths
            should be yielded
        "dirs" is string indicating under what conditions dirs are
            yielded. It must be one of:
              never             (default) never yield dirs
              always            yield all dirs matching given patterns
              if-not-recursive  only yield dirs for invocations when
                                recursive=False
            See use cases below for more details.
        "recursive" is boolean (default True) indicating if paths should
            be recursively yielded under given dirs.
        "includes" is a list of file patterns to include in recursive
            searches.
        "excludes" is a list of file and dir patterns to exclude.
            (Note: This is slightly different than GNU grep's --exclude
            option which only excludes *files*. I.e. you cannot exclude
            a ".svn" dir.)
        "skip_dupe_dirs" can be set True to watch for and skip
            descending into a dir that has already been yielded. Note
            that this currently does not dereference symlinks.
        "follow_symlinks" is a boolean indicating whether to follow
            symlinks (default False). To guard against infinite loops
            with circular dir symlinks, only dir symlinks to *deeper*
            dirs are followed.
        "on_error" is an error callback called when a given path pattern
            matches nothing:
                on_error(PATH_PATTERN)
            If not specified, the default is to look for a "log" global and
            call:
                log.error("`%s': No such file or directory")
            Specify None to do nothing.

    Typically this is useful for a command-line tool that takes a list
    of paths as arguments. (For Unix-heads: the shell on Windows does
    NOT expand glob chars, that is left to the app.)

    Use case #1: like `grep -r`
      {files=True, dirs='never', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield nothing
        script PATH*    # yield all files matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files (not dirs) recursively under DIR
        script -r PATH* # yield files matching PATH* and files recursively
                        # under dirs matching PATH*; if none, call
                        # on_error(PATH*) callback

    Use case #2: like `file -r` (if it had a recursive option)
      {files=True, dirs='if-not-recursive', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield DIR, else call on_error(DIR)
        script PATH*    # yield all files and dirs matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files (not dirs) recursively under DIR
        script -r PATH* # yield files matching PATH* and files recursively
                        # under dirs matching PATH*; if none, call
                        # on_error(PATH*) callback

    Use case #3: kind of like `find .`
      {files=True, dirs='always', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield DIR, else call on_error(DIR)
        script PATH*    # yield all files and dirs matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files and dirs recursively under DIR
                        # (including DIR)
        script -r PATH* # yield files and dirs matching PATH* and recursively
                        # under dirs; if none, call on_error(PATH*)
                        # callback

    TODO: perf improvements (profile, stat just once)
    """
    from os.path import basename, exists, isdir, join, normpath, abspath, \
                        lexists, islink, realpath
    from glob import glob

    assert not isinstance(path_patterns, basestring), \
        "'path_patterns' must be a sequence, not a string: %r" % path_patterns
    GLOB_CHARS = '*?['

    if skip_dupe_dirs:
        searched_dirs = set()

    for path_pattern in path_patterns:
        # Determine the set of paths matching this path_pattern.
        for glob_char in GLOB_CHARS:
            if glob_char in path_pattern:
                paths = glob(path_pattern)
                break
        else:
            if follow_symlinks:
                paths = exists(path_pattern) and [path_pattern] or []
            else:
                paths = lexists(path_pattern) and [path_pattern] or []
        if not paths:
            if on_error is None:
                pass
            elif on_error is _NOT_SPECIFIED:
                try:
                    log.error("`%s': No such file or directory", path_pattern)
                except (NameError, AttributeError):
                    pass
            else:
                on_error(path_pattern)

        for path in paths:
            if (follow_symlinks or not islink(path)) and isdir(path):
                if skip_dupe_dirs:
                    canon_path = normpath(abspath(path))
                    if follow_symlinks:
                        canon_path = realpath(canon_path)
                    if canon_path in searched_dirs:
                        continue
                    else:
                        searched_dirs.add(canon_path)

                # 'includes' SHOULD affect whether a dir is yielded.
                if (dirs == "always"
                    or (dirs == "if-not-recursive" and not recursive)
                   ) and _should_include_path(path, includes, excludes):
                    yield path

                # However, if recursive, 'includes' should NOT affect
                # whether a dir is recursed into. Otherwise you could
                # not:
                #   script -r --include="*.py" DIR
                if recursive and _should_include_path(path, [], excludes):
                    for dirpath, dirnames, filenames in _walk(path,
                            follow_symlinks=follow_symlinks):
                        dir_indeces_to_remove = []
                        for i, dirname in enumerate(dirnames):
                            d = join(dirpath, dirname)
                            if skip_dupe_dirs:
                                canon_d = normpath(abspath(d))
                                if follow_symlinks:
                                    canon_d = realpath(canon_d)
                                if canon_d in searched_dirs:
                                    dir_indeces_to_remove.append(i)
                                    continue
                                else:
                                    searched_dirs.add(canon_d)
                            if dirs == "always" \
                               and _should_include_path(d, includes, excludes):
                                yield d
                            if not _should_include_path(d, [], excludes):
                                dir_indeces_to_remove.append(i)
                        for i in reversed(dir_indeces_to_remove):
                            del dirnames[i]
                        if files:
                            for filename in sorted(filenames):
                                f = join(dirpath, filename)
                                if _should_include_path(f, includes, excludes):
                                    yield f

            elif files and _should_include_path(path, includes, excludes):
                yield path
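
# Illustrative use, mimicking `grep -r` path handling (use case #1 above):
#   for p in _paths_from_path_patterns(["src", "*.txt"], recursive=True,
#                                      excludes=[".svn", "CVS"]):
#       print p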


class _NoReflowFormatter(optparse.IndentedHelpFormatter):
    """An optparse formatter that does NOT reflow the description."""
    def format_description(self, description):
        return description or ""


# Recipe: pretty_logging (0.1) in C:\trentm\tm\recipes\cookbook
class _PerLevelFormatter(logging.Formatter):
    """Allow multiple format strings -- depending on the log level.

    A "fmtFromLevel" optional arg is added to the constructor. It can be
    a dictionary mapping a log record level to a format string. The
    usual "fmt" argument acts as the default.
    """
    def __init__(self, fmt=None, datefmt=None, fmtFromLevel=None):
        logging.Formatter.__init__(self, fmt, datefmt)
        if fmtFromLevel is None:
            self.fmtFromLevel = {}
        else:
            self.fmtFromLevel = fmtFromLevel

    def format(self, record):
        record.lowerlevelname = record.levelname.lower()
        if record.levelno in self.fmtFromLevel:
            #XXX This is a non-threadsafe HACK. Really the base Formatter
            #    class should provide a hook accessor for the _fmt
            #    attribute. *Could* add a lock guard here (overkill?).
            _saved_fmt = self._fmt
            self._fmt = self.fmtFromLevel[record.levelno]
            try:
                return logging.Formatter.format(self, record)
            finally:
                self._fmt = _saved_fmt
        else:
            return logging.Formatter.format(self, record)


def _setup_logging(stream=None):
    """Do logging setup:

    We want a prettier default format:
        do: level: ...
    Spacing. Lower case. Skip " level:" if INFO-level.
    """
    hdlr = logging.StreamHandler(stream)
    defaultFmt = "%(name)s: %(levelname)s: %(message)s"
    infoFmt = "%(name)s: %(message)s"
    fmtr = _PerLevelFormatter(fmt=defaultFmt,
                              fmtFromLevel={logging.INFO: infoFmt})
    hdlr.setFormatter(fmtr)
    logging.root.addHandler(hdlr)
    log.setLevel(logging.INFO)
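
# With this setup an INFO record renders without its level name while other
# levels keep theirs (messages below are illustrative; the prefix is the
# module-level logger's name):
#   textinfo: some informational message
#   textinfo: ERROR: some error message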


#---- mainline

def main(argv):
    usage = "usage: %prog PATHS..."
    version = "%prog " + __version__
    parser = optparse.OptionParser(usage=usage,
        version=version, description=_cmdln_doc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("-q", "--quiet", dest="log_level",
                      action="store_const", const=logging.WARNING,
                      help="quieter output")
    parser.add_option("-r", "--recursive", action="store_true",
                      help="recursively descend into given paths")
    parser.add_option("-L", "--dereference", dest="follow_symlinks",
                      action="store_true",
                      help="follow symlinks, i.e. show info about linked-to "
                           "files and descend into linked dirs when recursive")
    parser.add_option("-Q", "--quick-determine-lang", action="store_true",
                      help="Skip some processing to attempt to determine "
                           "language. Things like specialization, emacs/vi "
                           "local vars, full decoding, are skipped.")
    parser.add_option("--encoding", help="suggested encoding for input files")
    parser.add_option("-f", "--format",
                      help="format of output: summary (default), dict")
    parser.add_option("-x", "--exclude", dest="excludes", action="append",
                      metavar="PATTERN",
                      help="path pattern to exclude for recursive search (by "
                           "default SCC control dirs are skipped)")
    parser.set_defaults(log_level=logging.INFO, encoding=None, recursive=False,
                        follow_symlinks=False, format="summary",
                        excludes=[".svn", "CVS", ".hg", ".git", ".bzr"],
                        quick_determine_lang=False)
    opts, args = parser.parse_args(argv[1:])  # parse the given argv, not
                                              # implicitly sys.argv
    log.setLevel(opts.log_level)
    if opts.log_level > logging.INFO:
        warnings.simplefilter("ignore", ChardetImportWarning)

    if args:
        path_patterns = args
    elif sys.stdin.isatty():
        parser.print_help()
        return 0
    else:
        def args_from_stdin():
            for line in sys.stdin:
                yield line.rstrip("\r\n")
        path_patterns = args_from_stdin()

    for path in _paths_from_path_patterns(
            path_patterns, excludes=opts.excludes,
            recursive=opts.recursive,
            dirs="if-not-recursive",
            follow_symlinks=opts.follow_symlinks):
        try:
            ti = textinfo_from_path(path, encoding=opts.encoding,
                    follow_symlinks=opts.follow_symlinks,
                    quick_determine_lang=opts.quick_determine_lang)
        except OSError, ex:
            log.error("%s: %s", path, ex)
            continue
        if opts.format == "summary":
            print ti.as_summary()
        elif opts.format == "dict":
            d = ti.as_dict()
            if "text" in d:
                del d["text"]
            pprint(d)
        else:
            raise TextInfoError("unknown output format: %r" % opts.format)
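
# Illustrative command lines (running this file directly):
#   python textinfo.py foo.py            # one-line summary per path
#   python textinfo.py -f dict foo.py    # dump the full info dict
#   python textinfo.py -r -x .git src    # recurse, skipping '.git' dirs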


if __name__ == "__main__":
    _setup_logging()
    try:
        if "--self-test" in sys.argv:
            import doctest
            retval = doctest.testmod()[0]
        else:
            retval = main(sys.argv)
    except SystemExit:
        pass
    except KeyboardInterrupt:
        sys.exit(1)
    except:
        exc_info = sys.exc_info()
        if log.isEnabledFor(logging.DEBUG):
            import traceback
            print
            traceback.print_exception(*exc_info)
        else:
            if hasattr(exc_info[0], "__name__"):
                #log.error("%s: %s", exc_info[0].__name__, exc_info[1])
                log.error(exc_info[1])
            else:  # string exception
                log.error(exc_info[0])
        sys.exit(1)
    else:
        sys.exit(retval)