2017/11/19

可利用 Tika 來取得檔案的內文、Metadata 以及 MIME type

  1. 先準備 pom.xml,在 <project> 中加入下列 <dependencies>:

    <dependencies>
        <dependency>
           <groupId>org.apache.tika</groupId>
           <artifactId>tika-core</artifactId>
           <version>1.16</version>
        </dependency>
        <dependency>
           <groupId>org.apache.tika</groupId>
           <artifactId>tika-parsers</artifactId>
           <version>1.16</version>
           <!-- tika-parsers depends on cxf-rt-rs-client and quartz for reasons the author could not determine; they are excluded for this example -->
           <exclusions>
             <exclusion>
              <groupId>org.apache.cxf</groupId>
              <artifactId>cxf-rt-rs-client</artifactId>
             </exclusion>
             <exclusion>
              <groupId>org.quartz-scheduler</groupId>
              <artifactId>quartz</artifactId>
             </exclusion>
           </exclusions>
        </dependency>
        <dependency>
           <groupId>junit</groupId>
           <artifactId>junit</artifactId>
           <version>4.12</version>
           <scope>test</scope>
        </dependency>
    </dependencies>
    
  2. 執行 mvn eclipse:eclipse

  3. 打開 Eclipse,並將剛剛建立的 project 匯入

  4. 建立 TikaTest.java(程式會從 classpath 根目錄讀取 test.html,請先準備好這個檔案,例如放在 src/test/resources/ 底下)

    package idv.shunyi.tika;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.io.Reader;
    
    import org.apache.tika.Tika;
    import org.apache.tika.metadata.Metadata;
    import org.junit.Before;
    import org.junit.Test;
    
    /**
     * Demonstrates three uses of the {@link Tika} facade against the classpath
     * resource {@code /test.html}: reading extracted text through a reader,
     * extracting text as a single string, and detecting the media type.
     *
     * <p>Every test only prints its result to standard output; nothing is asserted.
     */
    public class TikaTest {
        private Tika tika;
        private File file;

        /**
         * Creates a new {@link Tika} instance and resolves {@code /test.html}
         * from the classpath into a {@link File} before each test.
         *
         * @throws Exception if the resource URL cannot be converted to a URI
         */
        @Before
        public void setUp() throws Exception {
            tika = new Tika();
            file = new File(TikaTest.class.getResource("/test.html").toURI());
        }

        /**
         * Parses the file through a reader, prints the extracted text line by
         * line, then prints every metadata name with its value.
         *
         * @throws Exception if opening, parsing, or reading the file fails
         */
        @Test
        public void testParse() throws Exception {
            Metadata metadata = new Metadata();

            try (InputStream source = new FileInputStream(file)) {
                Reader parsed = tika.parse(source, metadata);

                try (BufferedReader lines = new BufferedReader(parsed)) {
                    printExtractedText(lines);
                    // Metadata is printed only after the text has been read to the end.
                    printMetadata(metadata);
                }
            }
        }

        /**
         * Parses the file into one string and prints it.
         *
         * @throws Exception if opening or parsing the file fails
         */
        @Test
        public void testParseToString() throws Exception {
            System.out.println("### parse to string");
            try (InputStream source = new FileInputStream(file)) {
                System.out.println(tika.parseToString(source));
            }
            System.out.println();
        }

        /**
         * Detects the media type of the file and prints it.
         *
         * @throws Exception if detection fails
         */
        @Test
        public void testDetect() throws Exception {
            System.out.println("### detect media type");
            System.out.println(tika.detect(file));
        }

        /** Prints a heading, every line available from {@code lines}, and a trailing blank line. */
        private static void printExtractedText(BufferedReader lines) throws Exception {
            System.out.println("### extracted text content");
            String line = lines.readLine();
            while (line != null) {
                System.out.println(line);
                line = lines.readLine();
            }
            System.out.println();
        }

        /** Prints a heading, one {@code name = value} line per metadata name, and a trailing blank line. */
        private static void printMetadata(Metadata metadata) {
            System.out.println("### metadata");
            for (String key : metadata.names()) {
                System.out.println(key + " = " + metadata.get(key));
            }
            System.out.println();
        }
    }
    
  5. 執行結果如下:

    ### extracted text content
    
        這是內文
    
    ### metadata
    X-Parsed-By = org.apache.tika.parser.DefaultParser
    dc:title = 這是標題
    Content-Encoding = UTF-8
    title = 這是標題
    Content-Type = text/html; charset=UTF-8
    
    ### parse to string
    
        這是內文
    
    
    ### detect media type
    text/html