2017/11/19
可利用 Tika 來取得檔案的內文、Metadata 以及 MIME type
先準備 pom.xml,在 <project> 中加入以下 <dependencies> 區塊:
<!-- Dependencies for this example: two Apache Tika artifacts (both 1.16) plus JUnit for the test class. -->
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.16</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.16</version>
<!-- For some reason tika-parsers depends on cxf-rt-rs-client and quartz; they are excluded in this example. -->
<exclusions>
<exclusion>
<groupId>org.apache.cxf</groupId>
<artifactId>cxf-rt-rs-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- JUnit 4 is declared with test scope, so it is only on the classpath when tests are compiled and run. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
執行 mvn eclipse:eclipse
打開 Eclipse,並將剛剛建立的 project 匯入
建立 TikaTest.java
package idv.shunyi.tika;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.Reader;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
/**
 * Demonstrates the {@link Tika} facade against the {@code /test.html}
 * classpath resource: streaming text extraction with metadata, one-shot
 * extraction to a string, and media type detection.
 *
 * <p>The tests print their results to standard output; they make no
 * assertions.
 */
public class TikaTest {

    /** Facade under demonstration; a new instance is created before each test. */
    private Tika tika;

    /** The {@code /test.html} classpath resource, resolved to a file. */
    private File file;

    /**
     * Creates the Tika facade and resolves the sample document.
     *
     * @throws Exception if the resource URL cannot be converted to a file
     */
    @Before
    public void setUp() throws Exception {
        tika = new Tika();
        file = new File(TikaTest.class.getResource("/test.html").toURI());
    }

    /**
     * Parses the sample file through a reader, printing the extracted text
     * followed by the metadata collected during parsing.
     *
     * @throws Exception if the file cannot be opened or read
     */
    @Test
    public void testParse() throws Exception {
        Metadata metadata = new Metadata();
        try (InputStream in = new FileInputStream(file)) {
            Reader parsed = tika.parse(in, metadata);
            try (BufferedReader lines = new BufferedReader(parsed)) {
                // The metadata is printed only after the text has been read to the end.
                printExtractedText(lines);
                printMetadata(metadata);
            }
        }
    }

    /**
     * Extracts the sample file's text in a single call and prints it.
     *
     * @throws Exception if the file cannot be opened or parsed
     */
    @Test
    public void testParseToString() throws Exception {
        System.out.println("### parse to string");
        try (InputStream stream = new FileInputStream(file)) {
            System.out.println(tika.parseToString(stream));
        }
        System.out.println();
    }

    /**
     * Detects and prints the media type of the sample file.
     *
     * @throws Exception if detection fails
     */
    @Test
    public void testDetect() throws Exception {
        System.out.println("### detect media type");
        System.out.println(tika.detect(file));
    }

    /** Prints a header, every remaining line of {@code lines}, then a blank line. */
    private static void printExtractedText(BufferedReader lines) throws Exception {
        System.out.println("### extracted text content");
        String line = lines.readLine();
        while (line != null) {
            System.out.println(line);
            line = lines.readLine();
        }
        System.out.println();
    }

    /** Prints a header, one {@code name = value} line per metadata name, then a blank line. */
    private static void printMetadata(Metadata metadata) {
        System.out.println("### metadata");
        for (String key : metadata.names()) {
            System.out.println(key + " = " + metadata.get(key));
        }
        System.out.println();
    }
}
執行結果如下:
### extracted text content
這是內文
### metadata
X-Parsed-By = org.apache.tika.parser.DefaultParser
dc:title = 這是標題
Content-Encoding = UTF-8
title = 這是標題
Content-Type = text/html; charset=UTF-8
### parse to string
這是內文
### detect media type
text/html