/Test2010/Library/IFilter.cs

# · C# · 886 lines · 453 code · 90 blank · 343 comment · 60 complexity · b96bebac5bad9f122254d31325f2fa86 MD5 · raw file

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Text;
  4. using System.IO;
  5. using System.Runtime.InteropServices;
  6. using System.Runtime.InteropServices.ComTypes;
  7. using Microsoft.Win32;
  8. namespace Xnlab.Filio
  9. {
  10. #region IFilter
  /// <summary>
  /// Managed declaration of the COM IClassFactory interface
  /// (IID 00000001-0000-0000-C000-000000000046), used to create instances
  /// of a COM class obtained through DllGetClassObject.
  /// </summary>
  /// <remarks>
  /// The method order is part of the COM contract and must not be changed.
  /// </remarks>
  [ComVisible(false)]
  [ComImport, InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("00000001-0000-0000-C000-000000000046")]
  internal interface IClassFactory
  {
      /// <summary>Creates an uninitialized object of the class served by this factory.</summary>
      /// <param name="pUnkOuter">Controlling unknown for aggregation, or null when not aggregating.</param>
      /// <param name="refiid">IID of the interface requested on the new object.</param>
      /// <param name="ppunk">Receives the requested interface of the new object.</param>
      void CreateInstance([MarshalAs(UnmanagedType.Interface)] object pUnkOuter, ref Guid refiid, [MarshalAs(UnmanagedType.Interface)] out object ppunk);

      /// <summary>Locks or unlocks the server that hosts the class factory.</summary>
      /// <param name="fLock">True to lock, false to unlock.</param>
      // The native parameter is a 4-byte Win32 BOOL. Without an explicit MarshalAs,
      // a bool on a COM interface method is marshalled as a 2-byte VARIANT_BOOL.
      void LockServer([MarshalAs(UnmanagedType.Bool)] bool fLock);
  }
  /// <summary>
  /// Utility class that obtains a class factory for a COM class ID by loading
  /// the DLL that implements the class and calling its exported
  /// DllGetClassObject function directly.
  /// </summary>
  internal static class ComHelper
  {
      // Signature of the DllGetClassObject export; returns an HRESULT (0 on success).
      private delegate int DllGetClassObject(ref Guid ClassId, ref Guid InterfaceId, [Out, MarshalAs(UnmanagedType.Interface)] out object ppunk);

      // Win32 functions used to load/unload DLLs and to resolve an exported function.
      private static class Win32NativeMethods
      {
          // Export names are ANSI strings, so this import stays ANSI.
          [DllImport("kernel32.dll", CharSet = CharSet.Ansi)]
          public static extern IntPtr GetProcAddress(IntPtr hModule, string lpProcName);

          [DllImport("kernel32.dll")]
          public static extern bool FreeLibrary(IntPtr hModule);

          // Unicode so that DLL paths containing characters outside the ANSI
          // code page can still be loaded.
          [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
          public static extern IntPtr LoadLibrary(string lpFileName);
      }

      /// <summary>
      /// Holds one module handle per loaded DLL name and frees the handles in
      /// its finalizer.
      /// </summary>
      private class DllList
      {
          // Keyed by the DLL name exactly as passed in, compared case-insensitively.
          private readonly Dictionary<string, IntPtr> _handles =
              new Dictionary<string, IntPtr>(StringComparer.OrdinalIgnoreCase);

          /// <summary>
          /// Returns the handle of an already loaded DLL, or loads it and
          /// remembers the handle. Failed loads are not remembered, so a later
          /// call tries again.
          /// </summary>
          /// <param name="dllName">Path or name of the DLL to load.</param>
          /// <returns>The module handle, or IntPtr.Zero when loading failed.</returns>
          public IntPtr GetOrLoad(string dllName)
          {
              lock (_handles)
              {
                  IntPtr handle;
                  if (_handles.TryGetValue(dllName, out handle))
                      return handle;

                  handle = Win32NativeMethods.LoadLibrary(dllName);
                  if (handle != IntPtr.Zero)
                      _handles.Add(dllName, handle);
                  return handle;
              }
          }

          ~DllList()
          {
              foreach (IntPtr handle in _handles.Values)
              {
                  try
                  {
                      Win32NativeMethods.FreeLibrary(handle);
                  }
                  catch
                  {
                      // Best effort only: an exception must not escape a finalizer.
                  }
              }
          }
      }

      // Keeps the loaded DLLs referenced until this object is finalized.
      private static readonly DllList _dllList = new DllList();

      /// <summary>
      /// Gets a class factory for a specific COM class ID.
      /// </summary>
      /// <param name="dllName">The DLL in which the COM class is implemented.</param>
      /// <param name="filterPersistClass">The requested class ID, as a GUID string.</param>
      /// <returns>
      /// An IClassFactory for the class, or null when an argument is null, empty
      /// or not a valid GUID, the DLL cannot be loaded, the DLL does not export
      /// DllGetClassObject, or DllGetClassObject reports a failure.
      /// </returns>
      internal static IClassFactory GetClassFactory(string dllName, string filterPersistClass)
      {
          return GetClassFactoryFromDll(dllName, filterPersistClass);
      }

      private static IClassFactory GetClassFactoryFromDll(string dllName, string filterPersistClass)
      {
          if (String.IsNullOrEmpty(dllName) || String.IsNullOrEmpty(filterPersistClass))
              return null;

          // Parse the class ID before touching the DLL so that a malformed
          // value neither throws nor causes a pointless load.
          Guid filterPersistGUID;
          try
          {
              filterPersistGUID = new Guid(filterPersistClass);
          }
          catch (FormatException)
          {
              return null;
          }
          catch (OverflowException)
          {
              return null;
          }

          // Load the DLL (or reuse the handle from an earlier call).
          IntPtr dllHandle = _dllList.GetOrLoad(dllName);
          if (dllHandle == IntPtr.Zero)
              return null;

          // Get a pointer to the DllGetClassObject function.
          IntPtr dllGetClassObjectPtr = Win32NativeMethods.GetProcAddress(dllHandle, "DllGetClassObject");
          if (dllGetClassObjectPtr == IntPtr.Zero)
              return null;

          // Convert the function pointer to a .NET delegate.
          DllGetClassObject dllGetClassObject = (DllGetClassObject)Marshal.GetDelegateForFunctionPointer(dllGetClassObjectPtr, typeof(DllGetClassObject));

          // Ask the DLL for the class factory of the requested class.
          Guid IClassFactoryGUID = new Guid("00000001-0000-0000-C000-000000000046"); // IID of IClassFactory
          object unk;
          if (dllGetClassObject(ref filterPersistGUID, ref IClassFactoryGUID, out unk) != 0)
              return null;

          return unk as IClassFactory;
      }
  }
  /// <summary>
  /// FilterLoader finds the DLL and class ID of the COM object responsible
  /// for filtering a specific file extension. It then loads that DLL, creates
  /// the COM object and returns an initialized IFilter instance.
  /// </summary>
  /// <remarks>
  /// Developing IFilter Add-ins
  /// http://msdn2.microsoft.com/en-us/library/aa965717.aspx
  /// </remarks>
  static class FilterLoader
  {
      #region CacheEntry
      // Result of a registry lookup for one extension. Both members are null
      // when the lookup found no filter.
      private class CacheEntry
      {
          public string DllName;
          public string ClassName;

          public CacheEntry(string dllName, string className)
          {
              DllName = dllName;
              ClassName = className;
          }
      }
      #endregion

      // Extension -> lookup result. Keys are compared case-insensitively and
      // independently of the current culture. Guarded by lock (_cache).
      private static readonly Dictionary<string, CacheEntry> _cache =
          new Dictionary<string, CacheEntry>(StringComparer.OrdinalIgnoreCase);

      #region Registry Read String helper
      // Reads the default value of a key below HKEY_LOCAL_MACHINE.
      static string ReadStrFromHKLM(string key)
      {
          return ReadStrFromHKLM(key, null);
      }

      // Reads a named value (or the default value when 'value' is null) of a
      // key below HKEY_LOCAL_MACHINE. Returns null when the key or the value
      // does not exist, or when the value is not a string.
      static string ReadStrFromHKLM(string key, string value)
      {
          using (RegistryKey rk = Registry.LocalMachine.OpenSubKey(key))
          {
              if (rk == null)
                  return null;
              return rk.GetValue(value) as string;
          }
      }
      #endregion

      /// <summary>
      /// Finds an IFilter implementation for a file type.
      /// </summary>
      /// <param name="ext">The extension of the file.</param>
      /// <returns>An IFilter instance, or null when none could be located or created.</returns>
      private static IFilter LoadIFilter(string ext)
      {
          string dllName, filterPersistClass;
          // Find the DLL and class ID, then load the DLL and create the filter.
          if (GetFilterDllAndClass(ext, out dllName, out filterPersistClass))
              return LoadFilterFromDll(dllName, filterPersistClass);
          return null;
      }

      /// <summary>
      /// Loads and initializes the filter registered for the extension of
      /// <paramref name="fileName"/>.
      /// </summary>
      internal static IFilter LoadAndInitIFilter(string fileName)
      {
          return LoadAndInitIFilter(fileName, Path.GetExtension(fileName));
      }

      /// <summary>
      /// Loads the filter registered for <paramref name="extension"/>, binds it
      /// to <paramref name="fileName"/> through IPersistFile and initializes it.
      /// </summary>
      /// <param name="fileName">The file to filter.</param>
      /// <param name="extension">The extension used to look up the filter.</param>
      /// <returns>
      /// The initialized filter, or null when an argument is null or empty, no
      /// filter is registered, the filter does not implement IPersistFile, or
      /// Init does not return S_OK.
      /// </returns>
      internal static IFilter LoadAndInitIFilter(string fileName, string extension)
      {
          if (String.IsNullOrEmpty(fileName) || String.IsNullOrEmpty(extension))
              return null;

          IFilter filter = LoadIFilter(extension);
          if (filter == null)
              return null;

          bool initialized = false;
          try
          {
              IPersistFile persistFile = filter as IPersistFile;
              if (persistFile != null)
              {
                  persistFile.Load(fileName, 0);

                  IFILTER_FLAGS flags;
                  IFILTER_INIT iflags =
                      IFILTER_INIT.CANON_HYPHENS |
                      IFILTER_INIT.CANON_PARAGRAPHS |
                      IFILTER_INIT.CANON_SPACES |
                      IFILTER_INIT.HARD_LINE_BREAKS |
                      IFILTER_INIT.APPLY_INDEX_ATTRIBUTES |
                      IFILTER_INIT.APPLY_CRAWL_ATTRIBUTES |
                      IFILTER_INIT.APPLY_OTHER_ATTRIBUTES |
                      IFILTER_INIT.FILTER_OWNED_VALUE_OK;

                  initialized = filter.Init(iflags, 0, IntPtr.Zero, out flags) == IFilterReturnCode.S_OK;
              }
              return initialized ? filter : null;
          }
          finally
          {
              // Release the filter when it is not handed to the caller; this
              // also covers an exception thrown by IPersistFile.Load.
              if (!initialized)
                  Marshal.ReleaseComObject(filter);
          }
      }

      // Creates an IFilter instance from the given DLL and class ID.
      // Returns null when no class factory is available or the created object
      // does not expose IFilter.
      private static IFilter LoadFilterFromDll(string dllName, string filterPersistClass)
      {
          // Get a class factory for our class ID.
          IClassFactory classFactory = ComHelper.GetClassFactory(dllName, filterPersistClass);
          if (classFactory == null)
              return null;

          try
          {
              // http://msdn2.microsoft.com/en-us/library/aa965717.aspx
              // {89BCB740-6119-101A-BCB7-00DD010655AF} is IID_IFilter, the
              // interface ID shared by all filters that implement IFilter.
              Guid IFilterGUID = new Guid("89BCB740-6119-101A-BCB7-00DD010655AF");
              object obj;
              classFactory.CreateInstance(null, ref IFilterGUID, out obj);
              return obj as IFilter;
          }
          finally
          {
              // The factory is only needed to create the instance.
              Marshal.ReleaseComObject(classFactory);
          }
      }

      // Resolves the DLL and class ID for an extension, consulting the cache
      // first. The result of a registry lookup is cached even when nothing
      // was found, so the registry is searched at most once per extension.
      private static bool GetFilterDllAndClass(string ext, out string dllName, out string filterPersistClass)
      {
          if (!GetFilterDllAndClassFromCache(ext, out dllName, out filterPersistClass))
          {
              string persistentHandlerClass = GetPersistentHandlerClass(ext, true);
              if (persistentHandlerClass != null)
              {
                  GetFilterDllAndClassFromPersistentHandler(persistentHandlerClass,
                      out dllName, out filterPersistClass);
              }
              AddExtensionToCache(ext, dllName, filterPersistClass);
          }
          return (dllName != null && filterPersistClass != null);
      }

      // Stores a lookup result unless the extension is already cached.
      private static void AddExtensionToCache(string ext, string dllName, string filterPersistClass)
      {
          lock (_cache)
          {
              if (!_cache.ContainsKey(ext))
                  _cache.Add(ext, new CacheEntry(dllName, filterPersistClass));
          }
      }

      // Reads the filter class ID registered for a persistent handler and the
      // InprocServer32 DLL of that class. Returns true only when both were found.
      private static bool GetFilterDllAndClassFromPersistentHandler(string persistentHandlerClass, out string dllName, out string filterPersistClass)
      {
          dllName = null;

          // Read the class ID of the IFilter persistent handler.
          filterPersistClass = ReadStrFromHKLM(@"Software\Classes\CLSID\" + persistentHandlerClass +
              @"\PersistentAddinsRegistered\{89BCB740-6119-101A-BCB7-00DD010655AF}");
          if (String.IsNullOrEmpty(filterPersistClass))
              return false;

          // Read the DLL name.
          dllName = ReadStrFromHKLM(@"Software\Classes\CLSID\" + filterPersistClass +
              @"\InprocServer32");
          return !String.IsNullOrEmpty(dllName);
      }

      // Looks for a persistent handler class ID using, in order: the extension
      // key, the document type of the extension, and (optionally) the
      // extension associated with the extension's content type.
      private static string GetPersistentHandlerClass(string ext, bool searchContentType)
      {
          string persistentHandlerClass = GetPersistentHandlerClassFromExtension(ext);
          if (String.IsNullOrEmpty(persistentHandlerClass))
              persistentHandlerClass = GetPersistentHandlerClassFromDocumentType(ext);
          if (searchContentType && String.IsNullOrEmpty(persistentHandlerClass))
              persistentHandlerClass = GetPersistentHandlerClassFromContentType(ext);
          return persistentHandlerClass;
      }

      private static string GetPersistentHandlerClassFromContentType(string ext)
      {
          string contentType = ReadStrFromHKLM(@"Software\Classes\" + ext, "Content Type");
          if (String.IsNullOrEmpty(contentType))
              return null;

          string contentTypeExtension = ReadStrFromHKLM(@"Software\Classes\MIME\Database\Content Type\" + contentType,
              "Extension");

          // Nothing to retry with when the content type names no extension,
          // or names the extension that was already searched.
          if (String.IsNullOrEmpty(contentTypeExtension) ||
              ext.Equals(contentTypeExtension, StringComparison.OrdinalIgnoreCase))
              return null;

          // Try again with the extension associated with the content type.
          // The content type is not searched a second time.
          return GetPersistentHandlerClass(contentTypeExtension, false);
      }

      private static string GetPersistentHandlerClassFromDocumentType(string ext)
      {
          // Get the document type of this file extension.
          string docType = ReadStrFromHKLM(@"Software\Classes\" + ext);
          if (String.IsNullOrEmpty(docType))
              return null;

          // Get the class ID for this document type.
          string docClass = ReadStrFromHKLM(@"Software\Classes\" + docType + @"\CLSID");
          if (String.IsNullOrEmpty(docClass))
              return null;

          // Now get the persistent handler for that class ID.
          return ReadStrFromHKLM(@"Software\Classes\CLSID\" + docClass + @"\PersistentHandler");
      }

      private static string GetPersistentHandlerClassFromExtension(string ext)
      {
          return ReadStrFromHKLM(@"Software\Classes\" + ext + @"\PersistentHandler");
      }

      // Returns true when the extension has a cache entry. The out values may
      // still be null when the cached lookup found no filter.
      private static bool GetFilterDllAndClassFromCache(string ext, out string dllName, out string filterPersistClass)
      {
          lock (_cache)
          {
              CacheEntry cacheEntry;
              if (_cache.TryGetValue(ext, out cacheEntry))
              {
                  dllName = cacheEntry.DllName;
                  filterPersistClass = cacheEntry.ClassName;
                  return true;
              }
          }
          dllName = null;
          filterPersistClass = null;
          return false;
      }
  }
  /// <summary>
  /// Implements a TextReader that reads the text chunks produced by an IFilter.
  /// </summary>
  internal class FilterReader : TextReader
  {
      // Minimum number of characters requested from the filter per GetText call.
      private const uint ReadAheadSize = 8192;

      IFilter _filter;
      private bool _done;
      private STAT_CHUNK _currentChunk;
      private bool _currentChunkValid;
      // Characters fetched from the filter that did not fit into the caller's
      // buffer; they are served first by the next Read call.
      private char[] _charsLeftFromLastRead;

      /// <summary>
      /// Creates a reader over the text content of <paramref name="fileName"/>.
      /// </summary>
      /// <param name="fileName">The file to read.</param>
      /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null.</exception>
      /// <exception cref="ArgumentException">No filter could be loaded and initialized for the file.</exception>
      public FilterReader(string fileName)
      {
          if (fileName == null)
              throw new ArgumentNullException("fileName");

          _filter = FilterLoader.LoadAndInitIFilter(fileName);
          if (_filter == null)
              throw new ArgumentException("no filter defined for " + fileName);
      }

      /// <summary>Releases the underlying filter.</summary>
      public override void Close()
      {
          Dispose(true);
          GC.SuppressFinalize(this);
      }

      /// <summary>
      /// Releases the filter COM object. Safe to call more than once.
      /// </summary>
      protected override void Dispose(bool disposing)
      {
          // The COM wrapper is a managed object and is only touched on an
          // explicit dispose; clearing the field prevents a second release.
          if (disposing && _filter != null)
          {
              Marshal.ReleaseComObject(_filter);
              _filter = null;
          }
          base.Dispose(disposing);
      }

      /// <summary>
      /// Reads up to <paramref name="count"/> characters of filtered text into
      /// <paramref name="array"/>, starting at <paramref name="offset"/>.
      /// </summary>
      /// <returns>
      /// The number of characters read; less than <paramref name="count"/>
      /// (possibly 0) once the filter has no more text.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="array"/> is null.</exception>
      /// <exception cref="ArgumentOutOfRangeException">offset or count is negative.</exception>
      /// <exception cref="ArgumentException">The range does not fit into the array.</exception>
      /// <exception cref="ObjectDisposedException">The reader has been closed.</exception>
      public override int Read(char[] array, int offset, int count)
      {
          if (array == null)
              throw new ArgumentNullException("array");
          if (offset < 0)
              throw new ArgumentOutOfRangeException("offset");
          if (count < 0)
              throw new ArgumentOutOfRangeException("count");
          if (array.Length - offset < count)
              throw new ArgumentException("offset and count exceed the length of the array");
          if (_filter == null)
              throw new ObjectDisposedException(GetType().Name);

          int endOfChunksCount = 0;
          int charsRead = 0;
          while (!_done && charsRead < count)
          {
              // Serve characters left over from the previous fetch first.
              if (_charsLeftFromLastRead != null)
              {
                  int charsToCopy = Math.Min(_charsLeftFromLastRead.Length, count - charsRead);
                  Array.Copy(_charsLeftFromLastRead, 0, array, offset + charsRead, charsToCopy);
                  charsRead += charsToCopy;
                  if (charsToCopy < _charsLeftFromLastRead.Length)
                  {
                      char[] tmp = new char[_charsLeftFromLastRead.Length - charsToCopy];
                      Array.Copy(_charsLeftFromLastRead, charsToCopy, tmp, 0, tmp.Length);
                      _charsLeftFromLastRead = tmp;
                  }
                  else
                      _charsLeftFromLastRead = null;
                  continue;
              }

              // Advance to the next chunk; only text chunks are read.
              if (!_currentChunkValid)
              {
                  IFilterReturnCode res = _filter.GetChunk(out _currentChunk);
                  _currentChunkValid = (res == IFilterReturnCode.S_OK) && ((_currentChunk.flags & CHUNKSTATE.CHUNK_TEXT) != 0);
                  if (res == IFilterReturnCode.FILTER_E_END_OF_CHUNKS)
                      endOfChunksCount++;
                  // Stop after a repeated end-of-chunks, and on any other
                  // failure that will not go away by asking again; otherwise
                  // this loop would never terminate.
                  if (endOfChunksCount > 1 || IsFatalChunkError(res))
                      _done = true;
              }

              if (_currentChunkValid)
              {
                  // Read ahead: always offer the filter a reasonably large buffer.
                  uint bufLength = Math.Max((uint)(count - charsRead), ReadAheadSize);
                  char[] buffer = new char[bufLength];
                  IFilterReturnCode res = _filter.GetText(ref bufLength, buffer);
                  if (res == IFilterReturnCode.S_OK || res == IFilterReturnCode.FILTER_S_LAST_TEXT)
                  {
                      int cRead = (int)bufLength;
                      if (cRead + charsRead > count)
                      {
                          // Keep what does not fit for the next call.
                          int charsLeft = (cRead + charsRead - count);
                          _charsLeftFromLastRead = new char[charsLeft];
                          Array.Copy(buffer, cRead - charsLeft, _charsLeftFromLastRead, 0, charsLeft);
                          cRead -= charsLeft;
                      }
                      else
                          _charsLeftFromLastRead = null;
                      Array.Copy(buffer, 0, array, offset + charsRead, cRead);
                      charsRead += cRead;
                  }
                  // Anything other than S_OK means this chunk has no further
                  // text (last text, no more text, no text, or an error), so
                  // move on to the next chunk instead of retrying forever.
                  if (res != IFilterReturnCode.S_OK)
                      _currentChunkValid = false;
              }
          }
          return charsRead;
      }

      // True for GetChunk failures after which no further chunk is requested.
      // End-of-chunks is counted by the caller; an unavailable embedding or
      // link only affects one chunk, so the next chunk is still requested.
      private static bool IsFatalChunkError(IFilterReturnCode res)
      {
          if (((uint)res & 0x80000000u) == 0)
              return false; // success and informational codes

          switch (res)
          {
              case IFilterReturnCode.FILTER_E_END_OF_CHUNKS:
              case IFilterReturnCode.FILTER_E_EMBEDDING_UNAVAILABLE:
              case IFilterReturnCode.FILTER_E_LINK_UNAVAILABLE:
                  return false;
              default:
                  return true;
          }
      }
  }
  /// <summary>
  /// Identifies a property by the GUID of its property set together with a
  /// PROPSPEC naming the property inside that set.
  /// </summary>
  [StructLayout(LayoutKind.Sequential)]
  public struct FULLPROPSPEC
  {
      /// <summary>GUID of the property set the property belongs to.</summary>
      public Guid guidPropSet;

      /// <summary>The property within the set.</summary>
      public PROPSPEC psProperty;
  }
  /// <summary>
  /// Describes a region of a chunk; passed to IFilter.BindRegion.
  /// Laid out as three consecutive 32-bit integers.
  /// </summary>
  [StructLayout(LayoutKind.Sequential)]
  internal struct FILTERREGION
  {
      /// <summary>Identifier of the chunk the region lies in.</summary>
      public int idChunk;

      /// <summary>Start of the region within the chunk.</summary>
      public int cwcStart;

      /// <summary>Extent of the region.</summary>
      public int cwcExtent;
  }
  /// <summary>
  /// Names a property either by numeric ID or by string. Mirrors the native
  /// layout: a 32-bit kind followed by a union of a property ID and a string
  /// pointer.
  /// </summary>
  /// <remarks>
  /// The union is stored in one pointer-sized slot after <see cref="ulKind"/>
  /// so that it is naturally aligned: offset 4 in a 32-bit process and offset
  /// 8 in a 64-bit process. A fixed offset of 4 is only correct for 32-bit
  /// processes. <see cref="propid"/> and <see cref="lpwstr"/> are two views
  /// of that same slot; writing one overwrites the other.
  /// </remarks>
  [StructLayout(LayoutKind.Sequential)]
  public struct PROPSPEC
  {
      /// <summary>0 - string used; 1 - PROPID.</summary>
      public int ulKind;

      // Storage shared by propid and lpwstr.
      private IntPtr _union;

      /// <summary>Numeric property ID (the low 32 bits of the union).</summary>
      public int propid
      {
          get { return unchecked((int)_union.ToInt64()); }
          set { _union = new IntPtr(value); }
      }

      /// <summary>Pointer to the property name string.</summary>
      public IntPtr lpwstr
      {
          get { return _union; }
          set { _union = value; }
      }
  }
  /// <summary>
  /// Flags returned by IFilter.Init describing additional properties that
  /// are available to the caller.
  /// </summary>
  [Flags]
  internal enum IFILTER_FLAGS
  {
      /// <summary>
      /// Additional properties should be located through the
      /// IPropertySetStorage and IPropertyStorage interfaces. When this flag
      /// is set, properties available through COM enumerators should not be
      /// returned from IFilter.
      /// </summary>
      IFILTER_FLAGS_OLE_PROPERTIES = 0x1
  }
  /// <summary>
  /// Flags passed to IFilter.Init that control text standardization,
  /// property output and how the filter is accessed. Members are listed in
  /// ascending bit order.
  /// </summary>
  [Flags]
  internal enum IFILTER_INIT
  {
      /// <summary>No flags.</summary>
      NONE = 0x000,

      /// <summary>
      /// Mark paragraph breaks with the Unicode PARAGRAPH SEPARATOR (0x2029).
      /// </summary>
      CANON_PARAGRAPHS = 0x001,

      /// <summary>
      /// Replace soft returns, such as the newline character in Microsoft
      /// Word, with hard returns, that is LINE SEPARATOR (0x2028). Existing
      /// hard returns can be doubled. A carriage return (0x000D), a line feed
      /// (0x000A), or the two in combination count as a hard return. The
      /// intent is to let pattern expressions match observed line breaks.
      /// </summary>
      HARD_LINE_BREAKS = 0x002,

      /// <summary>
      /// Standardize hyphens that the host character set cannot represent:
      /// optional hyphens (shown only at the end of a line) become nulls, and
      /// non-breaking hyphens become normal hyphens (0x2010) or
      /// HYPHEN-MINUSES (0x002D).
      /// </summary>
      CANON_HYPHENS = 0x004,

      /// <summary>
      /// Standardize spaces: all special space characters, such as
      /// non-breaking spaces, become the standard space character (0x0020).
      /// </summary>
      CANON_SPACES = 0x008,

      /// <summary>
      /// The client wants text split into chunks representing public
      /// value-type properties.
      /// </summary>
      APPLY_INDEX_ATTRIBUTES = 0x010,

      /// <summary>
      /// Emit any properties not covered by APPLY_INDEX_ATTRIBUTES and
      /// APPLY_CRAWL_ATTRIBUTES.
      /// </summary>
      APPLY_OTHER_ATTRIBUTES = 0x020,

      /// <summary>
      /// Optimizes the filter for indexing: the client calls IFilter::Init
      /// only once and never calls IFilter::BindRegion, so a chunk is never
      /// accessed both before and after another chunk.
      /// </summary>
      INDEXING_ONLY = 0x040,

      /// <summary>
      /// Text extraction must recursively search all linked objects within
      /// the document. When a link is unavailable, the IFilter::GetChunk call
      /// that would have obtained the link's first chunk should return
      /// FILTER_E_LINK_UNAVAILABLE.
      /// </summary>
      SEARCH_LINKS = 0x080,

      /// <summary>
      /// The client wants text split into chunks representing properties
      /// determined during the indexing process.
      /// </summary>
      APPLY_CRAWL_ATTRIBUTES = 0x100,

      /// <summary>
      /// The content indexing process can return property values set by the
      /// filter.
      /// </summary>
      FILTER_OWNED_VALUE_OK = 0x200
  }
  /// <summary>
  /// Description of the current chunk, filled in by IFilter.GetChunk.
  /// The field order matches the native structure and must not be changed.
  /// </summary>
  [StructLayout(LayoutKind.Sequential)]
  public struct STAT_CHUNK
  {
      /// <summary>
      /// The chunk identifier. Identifiers must be unique for the current
      /// IFilter instance and ascend in the order in which the chunks appear
      /// in the source document. Some search engines take advantage of the
      /// proximity of chunks of various properties, in which case the order
      /// in which chunks with different properties are emitted matters.
      /// </summary>
      public int idChunk;

      /// <summary>
      /// The type of break separating the previous chunk from this one;
      /// a CHUNK_BREAKTYPE value.
      /// </summary>
      [MarshalAs(UnmanagedType.U4)]
      public CHUNK_BREAKTYPE breakType;

      /// <summary>
      /// CHUNKSTATE flags telling whether the chunk holds a text-type or a
      /// value-type property. With CHUNK_TEXT set, IFilter::GetText retrieves
      /// the contents as a series of words. With CHUNK_VALUE set,
      /// IFilter::GetValue retrieves the value, which is treated as a single
      /// property value. Content that is to be treated as both text and value
      /// is emitted twice, in two chunks with one flag each.
      /// </summary>
      [MarshalAs(UnmanagedType.U4)]
      public CHUNKSTATE flags;

      /// <summary>
      /// The language and sublanguage of the chunk's text, used by document
      /// indexers for word breaking. Ignored unless the chunk is text-type or
      /// value-type with data type VT_LPWSTR, VT_LPSTR or VT_BSTR.
      /// </summary>
      public int locale;

      /// <summary>
      /// The property applied to the chunk. Text that needs more than one
      /// property is emitted once per property, in separate chunks.
      /// </summary>
      public FULLPROPSPEC attribute;

      /// <summary>
      /// The ID of the chunk's source. For a text-type property it equals
      /// idChunk. For a public value-type property derived from textual
      /// content it is the ID of the text-type chunk it was derived from.
      /// When only public value-type properties are requested there is no
      /// content chunk to derive from, and the value must be zero, which is
      /// an invalid chunk ID.
      /// </summary>
      public int idChunkSource;

      /// <summary>
      /// Offset within the source chunk at which the source text of a
      /// derived chunk starts.
      /// </summary>
      public int cwcStartSource;

      /// <summary>
      /// Length in characters of the source text this chunk was derived
      /// from. Zero means the derived text corresponds to the source text
      /// character by character; a nonzero value means there is no such
      /// direct correspondence.
      /// </summary>
      public int cwcLenSource;
  }
  /// <summary>
  /// The kinds of break that can separate the current chunk from the
  /// previous chunk read out of the filter.
  /// </summary>
  public enum CHUNK_BREAKTYPE
  {
      /// <summary>
      /// No break: the current chunk is glued to the previous one.
      /// </summary>
      CHUNK_NO_BREAK = 0,

      /// <summary>
      /// Word break between this chunk and the previous chunk with the same
      /// attribute. Use sparingly: word breaking is language-dependent and
      /// best left to the search engine.
      /// </summary>
      CHUNK_EOW = 1,

      /// <summary>
      /// Sentence break between this chunk and the previous chunk with the
      /// same attribute.
      /// </summary>
      CHUNK_EOS = 2,

      /// <summary>
      /// Paragraph break between this chunk and the previous chunk with the
      /// same attribute.
      /// </summary>
      CHUNK_EOP = 3,

      /// <summary>
      /// Chapter break between this chunk and the previous chunk with the
      /// same attribute.
      /// </summary>
      CHUNK_EOC = 4
  }
  /// <summary>
  /// Bit flags describing the kind of property a chunk contains. The values
  /// are tested with bitwise operations, so the enum is declared as flags.
  /// </summary>
  [Flags]
  public enum CHUNKSTATE
  {
      /// <summary>
      /// The current chunk is a text-type property.
      /// </summary>
      CHUNK_TEXT = 0x1,

      /// <summary>
      /// The current chunk is a value-type property.
      /// </summary>
      CHUNK_VALUE = 0x2,

      /// <summary>
      /// Reserved
      /// </summary>
      CHUNK_FILTER_OWNED_VALUE = 0x4
  }
  /// <summary>
  /// HRESULT values returned by the IFilter methods. Values with the high
  /// bit set are failures; the others are success or informational codes.
  /// </summary>
  internal enum IFilterReturnCode : uint
  {
      /// <summary>
      /// Success
      /// </summary>
      S_OK = 0,
      /// <summary>
      /// The function was denied access to the filter file.
      /// </summary>
      E_ACCESSDENIED = 0x80070005,
      /// <summary>
      /// The function encountered an invalid handle,
      /// probably due to a low-memory situation.
      /// </summary>
      E_HANDLE = 0x80070006,
      /// <summary>
      /// The function received an invalid parameter.
      /// </summary>
      E_INVALIDARG = 0x80070057,
      /// <summary>
      /// Out of memory
      /// </summary>
      E_OUTOFMEMORY = 0x8007000E,
      /// <summary>
      /// Not implemented
      /// </summary>
      E_NOTIMPL = 0x80004001,
      /// <summary>
      /// Unspecified failure. The Win32 value is 0x80004005; 0x80000008 is
      /// the 16-bit definition and is never returned by Win32 components.
      /// </summary>
      E_FAIL = 0x80004005,
      /// <summary>
      /// File not filtered due to password protection
      /// </summary>
      FILTER_E_PASSWORD = 0x8004170B,
      /// <summary>
      /// The document format is not recognised by the filter
      /// </summary>
      FILTER_E_UNKNOWNFORMAT = 0x8004170C,
      /// <summary>
      /// No text in current chunk
      /// </summary>
      FILTER_E_NO_TEXT = 0x80041705,
      /// <summary>
      /// No more chunks of text available in object
      /// </summary>
      FILTER_E_END_OF_CHUNKS = 0x80041700,
      /// <summary>
      /// No more text available in chunk
      /// </summary>
      FILTER_E_NO_MORE_TEXT = 0x80041701,
      /// <summary>
      /// No more property values available in chunk
      /// </summary>
      FILTER_E_NO_MORE_VALUES = 0x80041702,
      /// <summary>
      /// Unable to access object
      /// </summary>
      FILTER_E_ACCESS = 0x80041703,
      /// <summary>
      /// Moniker doesn't cover entire region
      /// </summary>
      FILTER_W_MONIKER_CLIPPED = 0x00041704,
      /// <summary>
      /// Unable to bind IFilter for embedded object
      /// </summary>
      FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
      /// <summary>
      /// Unable to bind IFilter for linked object
      /// </summary>
      FILTER_E_LINK_UNAVAILABLE = 0x80041708,
      /// <summary>
      /// This is the last text in the current chunk
      /// </summary>
      FILTER_S_LAST_TEXT = 0x00041709,
      /// <summary>
      /// This is the last value in the current chunk
      /// </summary>
      FILTER_S_LAST_VALUES = 0x0004170A
  }
  /// <summary>
  /// Managed declaration of the COM IFilter interface
  /// (IID 89BCB740-6119-101A-BCB7-00DD010655AF). Every method preserves its
  /// native signature and returns the raw result code instead of throwing.
  /// </summary>
  /// <remarks>
  /// The declaration order of the methods is part of the COM contract and
  /// must not be changed.
  /// </remarks>
  [ComImport]
  [Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
  [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
  internal interface IFilter
  {
      /// <summary>
      /// Initializes a filtering session.
      /// </summary>
      /// <param name="grfFlags">
      /// [in] IFILTER_INIT flags controlling text standardization, property
      /// output, embedding scope and IFilter access patterns.
      /// </param>
      /// <param name="cAttributes">
      /// [in] The size of the attributes array. When nonzero, it takes
      /// precedence over the attributes specified in grfFlags. If no
      /// attribute flags are specified and cAttributes is zero, the default
      /// is given by the PSGUID_STORAGE storage property set (date and time
      /// of the last write to the file, size, and so on) and by the
      /// PID_STG_CONTENTS 'contents' property, which maps to the main
      /// contents of the file.
      /// </param>
      /// <param name="aAttributes">
      /// [in] Array of pointers to FULLPROPSPEC structures for the requested
      /// properties. When cAttributes is nonzero, only these properties are
      /// returned.
      /// </param>
      /// <param name="pdwFlags">
      /// [out] IFILTER_FLAGS describing additional properties available to
      /// the caller.
      /// </param>
      [PreserveSig]
      IFilterReturnCode Init(
          IFILTER_INIT grfFlags,
          int cAttributes,
          IntPtr aAttributes,
          out IFILTER_FLAGS pdwFlags);

      /// <summary>
      /// Positions the filter at the beginning of the next chunk (or of the
      /// first chunk on the first call) and returns a description of it.
      /// </summary>
      /// <param name="pStat">[out] Description of the current chunk.</param>
      [PreserveSig]
      IFilterReturnCode GetChunk(out STAT_CHUNK pStat);

      /// <summary>
      /// Retrieves text (text-type properties) from the current chunk, which
      /// must have the CHUNKSTATE value CHUNK_TEXT.
      /// </summary>
      /// <param name="pcwcBuffer">
      /// [in/out] On entry, the size of awcBuffer in wide/Unicode characters.
      /// On exit, the number of Unicode characters written to awcBuffer.
      /// This is a character count, not a byte count.
      /// </param>
      /// <param name="awcBuffer">
      /// Receives the text of the current chunk. The buffer is not
      /// terminated with a character.
      /// </param>
      [PreserveSig]
      IFilterReturnCode GetText(
          ref uint pcwcBuffer,
          [Out(), MarshalAs(UnmanagedType.LPArray)] char[] awcBuffer);

      /// <summary>
      /// Retrieves a value (public value-type property) from a chunk, which
      /// must have the CHUNKSTATE value CHUNK_VALUE.
      /// </summary>
      /// <param name="PropVal">
      /// The PROPVARIANT structure is allocated with CoTaskMemAlloc. Some
      /// PROPVARIANT structures contain pointers, which can be freed by
      /// calling PropVariantClear; doing so is up to the caller of GetValue.
      /// </param>
      /// <remarks>
      /// IFilter::GetValue
      /// http://msdn2.microsoft.com/en-us/library/ms690927.aspx
      /// Filtering File Properties
      /// http://msdn2.microsoft.com/en-us/library/ms692552.aspx
      /// </remarks>
      [PreserveSig]
      int GetValue(ref IntPtr PropVal);

      /// <summary>
      /// Retrieves an interface representing the specified portion of the
      /// object. Currently reserved for future use.
      /// </summary>
      [PreserveSig]
      int BindRegion(
          ref FILTERREGION origPos,
          ref Guid riid,
          ref object ppunk);
  }
  795. #endregion
  796. }