
/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java

http://github.com/apache/hadoop-common
Possible License(s): Apache-2.0, BSD-3-Clause
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.*;
/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files. The index files are of the form _masterindex
 * and _index. The master index is a level of indirection
 * into the index file that makes the lookups faster. The index
 * file is sorted by the hash codes of the paths that it contains,
 * and the master index contains pointers to the positions in the
 * index for ranges of hash codes.
 */
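// Illustrative usage (a sketch; the archive path and namenode address below
// are hypothetical, and 'conf' is assumed to be an existing Configuration):
//
//   Path p = new Path("har://hdfs-namenode:8020/user/data/foo.har/dir/file");
//   FileSystem harFs = p.getFileSystem(conf);  // resolves to a HarFileSystem
//   FSDataInputStream in = harFs.open(p);      // reads bytes out of a part-* file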
public class HarFileSystem extends FileSystem {

  private static final Log LOG = LogFactory.getLog(HarFileSystem.class);

  public static final String METADATA_CACHE_ENTRIES_KEY = "fs.har.metadatacache.entries";
  public static final int METADATA_CACHE_ENTRIES_DEFAULT = 10;

  public static final int VERSION = 3;

  private static Map<URI, HarMetaData> harMetaCache;

  // uri representation of this Har filesystem
  private URI uri;
  // the top level path of the archive
  // in the underlying file system
  private Path archivePath;
  // the har auth
  private String harAuth;

  // pointer into the static metadata cache
  private HarMetaData metadata;

  private FileSystem fs;

  /**
   * public construction of harfilesystem
   */
  public HarFileSystem() {
    // Must call #initialize() method to set the underlying file system
  }

  /**
   * Return the protocol scheme for the FileSystem.
   * <p/>
   *
   * @return <code>har</code>
   */
  @Override
  public String getScheme() {
    return "har";
  }

  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs underlying file system
   */
  public HarFileSystem(FileSystem fs) {
    this.fs = fs;
    this.statistics = fs.statistics;
  }

  private synchronized void initializeMetadataCache(Configuration conf) {
    if (harMetaCache == null) {
      int cacheSize = conf.getInt(METADATA_CACHE_ENTRIES_KEY, METADATA_CACHE_ENTRIES_DEFAULT);
      harMetaCache = Collections.synchronizedMap(new LruCache<URI, HarMetaData>(cacheSize));
    }
  }
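  // For reference, the cache size can be tuned through the configuration
  // before the first HarFileSystem is initialized (a sketch using the key
  // defined above):
  //
  //   Configuration conf = new Configuration();
  //   conf.setInt(HarFileSystem.METADATA_CACHE_ENTRIES_KEY, 32);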
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to go
   * on creating new Filesystem instances per call to
   * path.getFileSystem().
   * The uri of a Har is
   * har://underlyingfsscheme-host:port/archivepath
   * or
   * har:///archivepath. The latter form falls back to the
   * default filesystem when the underlying one is not specified.
   */
  @Override
  public void initialize(URI name, Configuration conf) throws IOException {
    // initialize the metadata cache, if needed
    initializeMetadataCache(conf);

    // decode the name
    URI underLyingURI = decodeHarURI(name, conf);
    // we got the right har Path- now check if this is
    // truly a har filesystem
    Path harPath = archivePath(
      new Path(name.getScheme(), name.getAuthority(), name.getPath()));
    if (harPath == null) {
      throw new IOException("Invalid path for the Har Filesystem. " +
                            name.toString());
    }
    if (fs == null) {
      fs = FileSystem.get(underLyingURI, conf);
    }
    uri = harPath.toUri();
    archivePath = new Path(uri.getPath());
    harAuth = getHarAuth(underLyingURI);
    //check for the underlying fs containing
    // the index file
    Path masterIndexPath = new Path(archivePath, "_masterindex");
    Path archiveIndexPath = new Path(archivePath, "_index");
    if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
      throw new IOException("Invalid path for the Har Filesystem. " +
          "No index file in " + harPath);
    }

    metadata = harMetaCache.get(uri);
    if (metadata != null) {
      FileStatus mStat = fs.getFileStatus(masterIndexPath);
      FileStatus aStat = fs.getFileStatus(archiveIndexPath);
      if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
          aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
        // the archive has been overwritten since we last read it
        // remove the entry from the meta data cache
        metadata = null;
        harMetaCache.remove(uri);
      }
    }
    if (metadata == null) {
      metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
      metadata.parseMetaData();
      harMetaCache.put(uri, metadata);
    }
  }
  @Override
  public Configuration getConf() {
    return fs.getConf();
  }

  // get the version of the filesystem from the masterindex file
  // the version is currently not useful since it's the first version
  // of archives
  public int getHarVersion() throws IOException {
    if (metadata != null) {
      return metadata.getVersion();
    } else {
      throw new IOException("Invalid meta data for the Har Filesystem");
    }
  }
  /*
   * find the parent path that is the
   * archive path in the path. The last
   * path segment that ends with .har is
   * the path that will be returned.
   */
  private Path archivePath(Path p) {
    Path retPath = null;
    Path tmp = p;
    for (int i = 0; i < p.depth(); i++) {
      if (tmp.toString().endsWith(".har")) {
        retPath = tmp;
        break;
      }
      tmp = tmp.getParent();
    }
    return retPath;
  }
  /**
   * decode the raw URI to get the underlying URI
   * @param rawURI raw Har URI
   * @param conf the configuration
   * @return filtered URI of the underlying fileSystem
   */
  private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
    String tmpAuth = rawURI.getAuthority();
    //we are using the default file
    //system in the config
    //so create an underlying uri and
    //return it
    if (tmpAuth == null) {
      //create a path
      return FileSystem.getDefaultUri(conf);
    }
    String authority = rawURI.getAuthority();
    if (authority == null) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since authority==null."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    int i = authority.indexOf('-');
    if (i < 0) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI since '-' not found."
          + " Expecting har://<scheme>-<host>/<path>.");
    }

    if (rawURI.getQuery() != null) {
      // query component not allowed
      throw new IOException("query component in Path not supported " + rawURI);
    }

    URI tmp;
    try {
      // convert <scheme>-<host> to <scheme>://<host>
      URI baseUri = new URI(authority.replaceFirst("-", "://"));
      tmp = new URI(baseUri.getScheme(), baseUri.getAuthority(),
            rawURI.getPath(), rawURI.getQuery(), rawURI.getFragment());
    } catch (URISyntaxException e) {
      throw new IOException("URI: " + rawURI
          + " is an invalid Har URI. Expecting har://<scheme>-<host>/<path>.");
    }
    return tmp;
  }
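  // Worked example of decodeHarURI (host and port are illustrative):
  //   rawURI:    har://hdfs-namenode:8020/user/data/foo.har/dir/file
  //   authority: "hdfs-namenode:8020" becomes "hdfs://namenode:8020"
  //   result:    hdfs://namenode:8020/user/data/foo.har/dir/file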
  private static String decodeString(String str)
    throws UnsupportedEncodingException {
    return URLDecoder.decode(str, "UTF-8");
  }

  private String decodeFileName(String fname)
    throws UnsupportedEncodingException {
    int version = metadata.getVersion();
    if (version == 2 || version == 3){
      return decodeString(fname);
    }
    return fname;
  }

  /**
   * return the top level archive.
   */
  @Override
  public Path getWorkingDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public Path getInitialWorkingDirectory() {
    return getWorkingDirectory();
  }

  @Override
  public FsStatus getStatus(Path p) throws IOException {
    return fs.getStatus(p);
  }

  /**
   * Create a har specific auth
   * har-underlyingfs:port
   * @param underLyingUri the uri of underlying
   * filesystem
   * @return har specific auth
   */
  private String getHarAuth(URI underLyingUri) {
    String auth = underLyingUri.getScheme() + "-";
    if (underLyingUri.getHost() != null) {
      if (underLyingUri.getUserInfo() != null) {
        auth += underLyingUri.getUserInfo();
        auth += "@";
      }
      auth += underLyingUri.getHost();
      if (underLyingUri.getPort() != -1) {
        auth += ":";
        auth += underLyingUri.getPort();
      }
    } else {
      auth += ":";
    }
    return auth;
  }
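  // Example (illustrative): for an underlying URI hdfs://alice@namenode:8020/
  // this yields "hdfs-alice@namenode:8020"; for a scheme with no host, such as
  // file:///, it yields "file-:".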
  /**
   * Used for delegation token related functionality. Must delegate to
   * underlying file system.
   */
  @Override
  protected URI getCanonicalUri() {
    return fs.getCanonicalUri();
  }

  @Override
  protected URI canonicalizeUri(URI uri) {
    return fs.canonicalizeUri(uri);
  }

  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsschema-host:port/pathintheunderlyingfs
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  @Override
  protected void checkPath(Path path) {
    fs.checkPath(path);
  }

  @Override
  public Path resolvePath(Path p) throws IOException {
    return fs.resolvePath(p);
  }
  /**
   * Returns the path inside the har filesystem, i.e. the
   * path relative to the archive root.
   * @param path the fully qualified path in the har filesystem.
   * @return relative path in the filesystem.
   */
  private Path getPathInHar(Path path) {
    Path harPath = new Path(path.toUri().getPath());
    if (archivePath.compareTo(harPath) == 0)
      return new Path(Path.SEPARATOR);
    Path tmp = new Path(harPath.getName());
    Path parent = harPath.getParent();
    while (!(parent.compareTo(archivePath) == 0)) {
      if (parent.toString().equals(Path.SEPARATOR)) {
        tmp = null;
        break;
      }
      tmp = new Path(parent.getName(), tmp);
      parent = parent.getParent();
    }
    if (tmp != null)
      tmp = new Path(Path.SEPARATOR, tmp);
    return tmp;
  }
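  // Example (illustrative): with archivePath = /user/data/foo.har,
  //   getPathInHar(/user/data/foo.har)     returns /
  //   getPathInHar(/user/data/foo.har/a/b) returns /a/b
  //   getPathInHar(/user/other/x)          returns null (the walk reaches
  //                                        the root without hitting the archive)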
  // make p relative to the given initial path; basically
  // getting rid of the leading /. Parsing and doing
  // string manipulation is not good, so
  // just use the Path API to do it.
  private Path makeRelative(String initial, Path p) {
    String scheme = this.uri.getScheme();
    String authority = this.uri.getAuthority();
    Path root = new Path(Path.SEPARATOR);
    if (root.compareTo(p) == 0)
      return new Path(scheme, authority, initial);

    Path retPath = new Path(p.getName());
    Path parent = p.getParent();
    for (int i = 0; i < p.depth() - 1; i++) {
      retPath = new Path(parent.getName(), retPath);
      parent = parent.getParent();
    }
    return new Path(new Path(scheme, authority, initial),
      retPath.toString());
  }
  /* this makes a path qualified in the har filesystem
   * (non-Javadoc)
   * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
   * org.apache.hadoop.fs.Path)
   */
  @Override
  public Path makeQualified(Path path) {
    // make sure that we just get the
    // path component
    Path fsPath = path;
    if (!path.isAbsolute()) {
      fsPath = new Path(archivePath, path);
    }

    URI tmpURI = fsPath.toUri();
    //change this to Har uri
    return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
  }

  /**
   * Fix offset and length of block locations.
   * Note that this method modifies the original array.
   * @param locations block locations of har part file
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @param fileOffsetInHar the offset of the desired file in the har part file
   * @return block locations with fixed offset and length
   */
  static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
                                           long start,
                                           long len,
                                           long fileOffsetInHar) {
    // offset 1 past last byte of desired range
    long end = start + len;

    for (BlockLocation location : locations) {
      // offset of part block relative to beginning of desired file
      // (may be negative if file starts in this part block)
      long harBlockStart = location.getOffset() - fileOffsetInHar;
      // offset 1 past last byte of har block relative to beginning of
      // desired file
      long harBlockEnd = harBlockStart + location.getLength();

      if (start > harBlockStart) {
        // desired range starts after beginning of this har block
        // fix offset to beginning of relevant range (relative to desired file)
        location.setOffset(start);
        // fix length to relevant portion of har block
        location.setLength(location.getLength() - (start - harBlockStart));
      } else {
        // desired range includes beginning of this har block
        location.setOffset(harBlockStart);
      }

      if (harBlockEnd > end) {
        // range ends before end of this har block
        // fix length to remove irrelevant portion at the end
        location.setLength(location.getLength() - (harBlockEnd - end));
      }
    }

    return locations;
  }
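  // Worked example (all numbers illustrative): a part block covers part-file
  // offsets [0, 512) and the desired file starts at fileOffsetInHar = 100, so
  // harBlockStart = -100 and harBlockEnd = 412 relative to the file. For a
  // request with start = 0, len = 300 (so end = 300):
  //   start (0) > harBlockStart (-100), so offset becomes 0 and
  //   length becomes 512 - 100 = 412;
  //   harBlockEnd (412) > end (300), so length is trimmed to 300.
  // The block is thus reported as [0, 300) relative to the contained file.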
  /**
   * Get block locations from the underlying fs and fix their
   * offsets and lengths.
   * @param file the input file status to get block locations
   * @param start the start of the desired range in the contained file
   * @param len the length of the desired range
   * @return block locations for this segment of file
   * @throws IOException
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
                                               long len) throws IOException {
    HarStatus hstatus = getFileHarStatus(file.getPath());
    Path partPath = new Path(archivePath, hstatus.getPartName());
    FileStatus partStatus = metadata.getPartFileStatus(partPath);

    // get all part blocks that overlap with the desired file blocks
    BlockLocation[] locations =
      fs.getFileBlockLocations(partStatus,
                               hstatus.getStartIndex() + start, len);

    return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
  }

  /**
   * the hash of the path p inside the filesystem
   * @param p the path in the harfilesystem
   * @return the hash code of the path.
   */
  public static int getHarHash(Path p) {
    return (p.toString().hashCode() & 0x7fffffff);
  }
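  // Note: the & 0x7fffffff above clears the sign bit, so the hash used for
  // index lookups is always non-negative, matching the hash ranges recorded
  // in the master index.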
  static class Store {
    public Store() {
      begin = end = startHash = endHash = 0;
    }
    public Store(long begin, long end, int startHash, int endHash) {
      this.begin = begin;
      this.end = end;
      this.startHash = startHash;
      this.endHash = endHash;
    }
    public long begin;
    public long end;
    public int startHash;
    public int endHash;
  }
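  // For orientation: each master index line after the version line has the
  // form
  //   <startHash> <endHash> <beginOffset> <endOffset>
  // (see parseMetaData below), i.e. one Store per line describing which byte
  // range of the _index file holds the entries whose path hashes fall in
  // [startHash, endHash].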
  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file, line by line, to get all statuses for the children
   * of a directory. It's a brute force way of getting all such filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   */
  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses)
      throws IOException {
    String parentString = parent.getName();
    if (!parentString.endsWith(Path.SEPARATOR)){
      parentString += Path.SEPARATOR;
    }
    Path harPath = new Path(parentString);
    int harlen = harPath.depth();
    final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();

    for (HarStatus hstatus : metadata.archive.values()) {
      String child = hstatus.getName();
      if ((child.startsWith(parentString))) {
        Path thisPath = new Path(child);
        if (thisPath.depth() == harlen + 1) {
          statuses.add(toFileStatus(hstatus, cache));
        }
      }
    }
  }
  /**
   * Combine the status stored in the index and the underlying status.
   * @param h status stored in the index
   * @param cache caching the underlying file statuses
   * @return the combined file status
   * @throws IOException
   */
  private FileStatus toFileStatus(HarStatus h,
      Map<String, FileStatus> cache) throws IOException {
    FileStatus underlying = null;
    if (cache != null) {
      underlying = cache.get(h.partName);
    }
    if (underlying == null) {
      final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName);
      underlying = fs.getFileStatus(p);
      if (cache != null) {
        cache.put(h.partName, underlying);
      }
    }

    long modTime = 0;
    int version = metadata.getVersion();
    if (version < 3) {
      modTime = underlying.getModificationTime();
    } else if (version == 3) {
      modTime = h.getModificationTime();
    }

    return new FileStatus(
        h.isDir() ? 0L : h.getLength(),
        h.isDir(),
        underlying.getReplication(),
        underlying.getBlockSize(),
        modTime,
        underlying.getAccessTime(),
        underlying.getPermission(),
        underlying.getOwner(),
        underlying.getGroup(),
        makeRelative(this.uri.getPath(), new Path(h.name)));
  }

  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
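  // For example (illustrative; URL-encoding of names and props is omitted),
  // a version-3 index might contain lines such as:
  //   /dir dir <encoded-props> 0 0 childA childB
  //   /dir/childA file part-0 0 1024 <encoded-props>
  // where <encoded-props> decodes to "modificationTime permission owner group"
  // (see the constructor below).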
  private class HarStatus {
    boolean isDir;
    String name;
    List<String> children;
    String partName;
    long startIndex;
    long length;
    long modificationTime = 0;

    public HarStatus(String harString) throws UnsupportedEncodingException {
      String[] splits = harString.split(" ");
      this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]) ? true : false;
      // this is equal to "none" if it's a directory
      this.partName = splits[2];
      this.startIndex = Long.parseLong(splits[3]);
      this.length = Long.parseLong(splits[4]);

      int version = metadata.getVersion();
      String[] propSplits = null;
      // propSplits is used to retrieve the metainformation that Har versions
      // 1 & 2 missed (modification time, permission, owner group).
      // These fields are stored in an encoded string placed in different
      // locations depending on whether it's a file or directory entry.
      // If it's a directory, the string will be placed at the partName
      // location (directories have no partName because they don't have data
      // to be stored). This is done because the number of fields in a
      // directory entry is unbounded (all children are listed at the end).
      // If it's a file, the string will be the last field.
      if (isDir) {
        if (version == 3){
          propSplits = decodeString(this.partName).split(" ");
        }
        children = new ArrayList<String>();
        for (int i = 5; i < splits.length; i++) {
          children.add(decodeFileName(splits[i]));
        }
      } else if (version == 3) {
        propSplits = decodeString(splits[5]).split(" ");
      }

      if (propSplits != null && propSplits.length >= 4) {
        modificationTime = Long.parseLong(propSplits[0]);
        // the fields below are stored in the file but are currently not used
        // by HarFileSystem
        // permission = new FsPermission(Short.parseShort(propSplits[1]));
        // owner = decodeString(propSplits[2]);
        // group = decodeString(propSplits[3]);
      }
    }
    public boolean isDir() {
      return isDir;
    }

    public String getName() {
      return name;
    }
    public String getPartName() {
      return partName;
    }
    public long getStartIndex() {
      return startIndex;
    }
    public long getLength() {
      return length;
    }
    public long getModificationTime() {
      return modificationTime;
    }
  }
  /**
   * return the filestatus of files in har archive.
   * The permissions returned are those of the archive
   * index files. The permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
  @Override
  public FileStatus getFileStatus(Path f) throws IOException {
    HarStatus hstatus = getFileHarStatus(f);
    return toFileStatus(hstatus, null);
  }
  private HarStatus getFileHarStatus(Path f) throws IOException {
    // get the fs DataInputStream for the underlying file
    // look up the index.
    Path p = makeQualified(f);
    Path harPath = getPathInHar(p);
    if (harPath == null) {
      throw new IOException("Invalid file name: " + f + " in " + uri);
    }
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
    }
    return hstatus;
  }

  /**
   * @return null since no checksum algorithm is implemented.
   */
  @Override
  public FileChecksum getFileChecksum(Path f, long length) {
    return null;
  }

  /**
   * Returns a har input stream which fakes end of
   * file. It reads the index files to get the part
   * file name and the size and start of the file.
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // get the fs DataInputStream for the underlying file
    HarStatus hstatus = getFileHarStatus(f);
    if (hstatus.isDir()) {
      throw new FileNotFoundException(f + " : not a file in " +
                archivePath);
    }
    return new HarFSDataInputStream(fs, new Path(archivePath,
        hstatus.getPartName()),
        hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
  }

  /**
   * Used for delegation token related functionality. Must delegate to
   * underlying file system.
   */
  @Override
  public FileSystem[] getChildFileSystems() {
    return new FileSystem[]{fs};
  }

  @Override
  public FSDataOutputStream create(Path f, FsPermission permission,
      boolean overwrite, int bufferSize, short replication, long blockSize,
      Progressable progress) throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @SuppressWarnings("deprecation")
  @Override
  public FSDataOutputStream createNonRecursive(Path f, boolean overwrite,
      int bufferSize, short replication, long blockSize, Progressable progress)
      throws IOException {
    throw new IOException("Har: create not allowed.");
  }

  @Override
  public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
    throw new IOException("Har: append not allowed.");
  }

  @Override
  public void close() throws IOException {
    super.close();
    if (fs != null) {
      try {
        fs.close();
      } catch(IOException ie) {
        //this might already be closed
        // ignore
      }
    }
  }
  /**
   * Not implemented.
   */
  @Override
  public boolean setReplication(Path src, short replication) throws IOException {
    throw new IOException("Har: setReplication not allowed");
  }

  @Override
  public boolean rename(Path src, Path dst) throws IOException {
    throw new IOException("Har: rename not allowed");
  }

  @Override
  public FSDataOutputStream append(Path f) throws IOException {
    throw new IOException("Har: append not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {
    throw new IOException("Har: delete not allowed");
  }

  /**
   * liststatus returns the children of a directory
   * after looking up the index files.
   */
  @Override
  public FileStatus[] listStatus(Path f) throws IOException {
    //need to see if the file is an index in file
    //get the filestatus of the archive directory
    // we will create fake filestatuses to return
    // to the client
    List<FileStatus> statuses = new ArrayList<FileStatus>();
    Path tmpPath = makeQualified(f);
    Path harPath = getPathInHar(tmpPath);
    HarStatus hstatus = metadata.archive.get(harPath);
    if (hstatus == null) {
      throw new FileNotFoundException("File " + f + " not found in " + archivePath);
    }
    if (hstatus.isDir()) {
      fileStatusesInIndex(hstatus, statuses);
    } else {
      statuses.add(toFileStatus(hstatus, null));
    }

    return statuses.toArray(new FileStatus[statuses.size()]);
  }

  /**
   * return the top level archive path.
   */
  @Override
  public Path getHomeDirectory() {
    return new Path(uri.toString());
  }

  @Override
  public void setWorkingDirectory(Path newDir) {
    //does nothing.
  }

  /**
   * not implemented.
   */
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    throw new IOException("Har: mkdirs not allowed");
  }

  /**
   * not implemented.
   */
  @Override
  public void copyFromLocalFile(boolean delSrc, boolean overwrite,
      Path src, Path dst) throws IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  @Override
  public void copyFromLocalFile(boolean delSrc, boolean overwrite,
      Path[] srcs, Path dst) throws IOException {
    throw new IOException("Har: copyfromlocalfile not allowed");
  }

  /**
   * copies the file in the har filesystem to a local file.
   */
  @Override
  public void copyToLocalFile(boolean delSrc, Path src, Path dst)
      throws IOException {
    FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
  }

  /**
   * not implemented.
   */
  @Override
  public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: startLocalOutput not allowed");
  }

  /**
   * not implemented.
   */
  @Override
  public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
      throws IOException {
    throw new IOException("Har: completeLocalOutput not allowed");
  }

  /**
   * not implemented.
   */
  @Override
  public void setOwner(Path p, String username, String groupname)
      throws IOException {
    throw new IOException("Har: setowner not allowed");
  }

  @Override
  public void setTimes(Path p, long mtime, long atime) throws IOException {
    throw new IOException("Har: setTimes not allowed");
  }

  /**
   * Not implemented.
   */
  @Override
  public void setPermission(Path p, FsPermission permission)
      throws IOException {
    throw new IOException("Har: setPermission not allowed");
  }
  /**
   * Hadoop archives input stream. This input stream fakes EOF
   * since archive files are part of bigger part files.
   */
  private static class HarFSDataInputStream extends FSDataInputStream {
    /**
     * Create an input stream that fakes all the reads/positions/seeking.
     */
    private static class HarFsInputStream extends FSInputStream
        implements CanSetDropBehind, CanSetReadahead {
      private long position, start, end;
      //The underlying data input stream that the
      // underlying filesystem will return.
      private final FSDataInputStream underLyingStream;
      //one byte buffer
      private final byte[] oneBytebuff = new byte[1];

      HarFsInputStream(FileSystem fs, Path path, long start,
          long length, int bufferSize) throws IOException {
        if (length < 0) {
          throw new IllegalArgumentException("Negative length [" + length + "]");
        }
        underLyingStream = fs.open(path, bufferSize);
        underLyingStream.seek(start);
        // the start of this file in the part file
        this.start = start;
        // the position pointer in the part file
        this.position = start;
        // the end pointer in the part file
        this.end = start + length;
      }

      @Override
      public synchronized int available() throws IOException {
        long remaining = end - underLyingStream.getPos();
        if (remaining > Integer.MAX_VALUE) {
          return Integer.MAX_VALUE;
        }
        return (int) remaining;
      }

      @Override
      public synchronized void close() throws IOException {
        underLyingStream.close();
        super.close();
      }

      //not implemented
      @Override
      public void mark(int readLimit) {
        // do nothing
      }

      /**
       * reset is not implemented
       */
      @Override
      public void reset() throws IOException {
        throw new IOException("reset not implemented.");
      }

      @Override
      public synchronized int read() throws IOException {
        int ret = read(oneBytebuff, 0, 1);
        return (ret <= 0) ? -1 : (oneBytebuff[0] & 0xff);
      }
      // NB: currently this method is never actually executed because
      // java.io.DataInputStream.read(byte[]) directly delegates to
      // method java.io.InputStream.read(byte[], int, int).
      // However, potentially it can be invoked, so leave it intact for now.
      @Override
      public synchronized int read(byte[] b) throws IOException {
        final int ret = read(b, 0, b.length);
        return ret;
      }
      @Override
      public synchronized int read(byte[] b, int offset, int len)
          throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        position += ret;
        return ret;
      }
      @Override
      public synchronized long skip(long n) throws IOException {
        long tmpN = n;
        if (tmpN > 0) {
          final long actualRemaining = end - position;
          if (tmpN > actualRemaining) {
            tmpN = actualRemaining;
          }
          underLyingStream.seek(tmpN + position);
          position += tmpN;
          return tmpN;
        }
        // NB: the contract is described in java.io.InputStream.skip(long):
        // this method returns the number of bytes actually skipped, so,
        // the return value should never be negative.
        return 0;
      }

      @Override
      public synchronized long getPos() throws IOException {
        return (position - start);
      }

      @Override
      public synchronized void seek(final long pos) throws IOException {
        validatePosition(pos);
        position = start + pos;
        underLyingStream.seek(position);
      }

      private void validatePosition(final long pos) throws IOException {
        if (pos < 0) {
          throw new IOException("Negative position: " + pos);
        }
        final long length = end - start;
        if (pos > length) {
          throw new IOException("Position behind the end " +
              "of the stream (length = " + length + "): " + pos);
        }
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        // do not need to implement this
        // hdfs in itself does seektonewsource
        // while reading.
        return false;
      }

      /**
       * implementing position readable.
       */
      @Override
      public int read(long pos, byte[] b, int offset, int length)
          throws IOException {
        int nlength = length;
        if (start + nlength + pos > end) {
          // length corrected to the real remaining length:
          nlength = (int) (end - start - pos);
        }
        if (nlength <= 0) {
          // EOS:
          return -1;
        }
        return underLyingStream.read(pos + start, b, offset, nlength);
      }

      /**
       * position readable again.
       */
      @Override
      public void readFully(long pos, byte[] b, int offset, int length)
          throws IOException {
        if (start + length + pos > end) {
          throw new IOException("Not enough bytes to read.");
        }
        underLyingStream.readFully(pos + start, b, offset, length);
      }

      @Override
      public void readFully(long pos, byte[] b) throws IOException {
        readFully(pos, b, 0, b.length);
      }

      @Override
      public void setReadahead(Long readahead) throws IOException {
        underLyingStream.setReadahead(readahead);
      }

      @Override
      public void setDropBehind(Boolean dropBehind) throws IOException {
        underLyingStream.setDropBehind(dropBehind);
      }
    }

    /**
     * constructors for har input stream.
     * @param fs the underlying filesystem
     * @param p The path in the underlying filesystem
     * @param start the start position in the part file
     * @param length the length of valid data in the part file
     * @param bufsize the buffer size
     * @throws IOException
     */
    public HarFSDataInputStream(FileSystem fs, Path p, long start,
        long length, int bufsize) throws IOException {
      super(new HarFsInputStream(fs, p, start, length, bufsize));
    }
  }
  private class HarMetaData {
    private FileSystem fs;
    private int version;
    // the masterIndex of the archive
    private Path masterIndexPath;
    // the index file
    private Path archiveIndexPath;

    private long masterIndexTimestamp;
    private long archiveIndexTimestamp;

    List<Store> stores = new ArrayList<Store>();
    Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
    private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();

    public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
      this.fs = fs;
      this.masterIndexPath = masterIndexPath;
      this.archiveIndexPath = archiveIndexPath;
    }

    public FileStatus getPartFileStatus(Path partPath) throws IOException {
      FileStatus status;
      status = partFileStatuses.get(partPath);
      if (status == null) {
        status = fs.getFileStatus(partPath);
        partFileStatuses.put(partPath, status);
      }
      return status;
    }

    public long getMasterIndexTimestamp() {
      return masterIndexTimestamp;
    }

    public long getArchiveIndexTimestamp() {
      return archiveIndexTimestamp;
    }

    private int getVersion() {
      return version;
    }

    private void parseMetaData() throws IOException {
      Text line = new Text();
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException("Invalid version " +
              this.version + " expected " + HarFileSystem.VERSION);
        }

        // each line contains a hashcode range and the index file name
        String[] readStr;
        while (read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash = Integer.parseInt(readStr[1]);
          stores.add(new Store(Long.parseLong(readStr[2]),
              Long.parseLong(readStr[3]), startHash,
              endHash));
          line.clear();
        }
      } catch (IOException ioe) {
        LOG.warn("Encountered exception ", ioe);
        throw ioe;
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s : stores) {
          read = 0;
          aIn.seek(s.begin);
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  }

  /*
   * testing purposes only:
   */
  HarMetaData getMetadata() {
    return metadata;
  }
  private static class LruCache<K, V> extends LinkedHashMap<K, V> {
    private final int MAX_ENTRIES;

    public LruCache(int maxEntries) {
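      // the 'true' flag selects access-order iteration, so the eldest entry
      // is the least recently used one; removeEldestEntry below then evicts
      // it once the size exceeds MAX_ENTRIES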
      super(maxEntries + 1, 1.0f, true);
      MAX_ENTRIES = maxEntries;
    }

    @Override
    protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
      return size() > MAX_ENTRIES;
    }
  }
  @SuppressWarnings("deprecation")
  @Override
  public FsServerDefaults getServerDefaults() throws IOException {
    return fs.getServerDefaults();
  }

  @Override
  public FsServerDefaults getServerDefaults(Path f) throws IOException {
    return fs.getServerDefaults(f);
  }

  @Override
  public long getUsed() throws IOException {
    return fs.getUsed();
  }

  @SuppressWarnings("deprecation")
  @Override
  public long getDefaultBlockSize() {
    return fs.getDefaultBlockSize();
  }

  @SuppressWarnings("deprecation")
  @Override
  public long getDefaultBlockSize(Path f) {
    return fs.getDefaultBlockSize(f);
  }

  @SuppressWarnings("deprecation")
  @Override
  public short getDefaultReplication() {
    return fs.getDefaultReplication();
  }

  @Override
  public short getDefaultReplication(Path f) {
    return fs.getDefaultReplication(f);
  }
}
  1138. }