CaboCha
/home/taku/proj/cabocha/src/cabocha.h
Go to the documentation of this file.
00001 /* CaboCha -- Yet Another Japanese Dependency Parser
00002    $Id: cabocha.h 50 2009-05-03 08:25:36Z taku-ku $;
00003    Copyright(C) 2001-2008 Taku Kudo <taku@chasen.org>
00004 */
00005 #ifndef CABOCHA_CABOCHA_H_
00006 #define CABOCHA_CABOCHA_H_
00007 
00008 #ifdef __cplusplus
00009 extern "C" {
00010 #endif
00011 
00012 #include <stddef.h>
00013 
00014 #ifdef _WIN32  /* Windows only: choose the DLL storage attribute used on every public declaration */
00015 #  ifdef DLL_EXPORT  /* DLL_EXPORT defined: mark symbols as exported */
00016 #    define CABOCHA_DLL_EXTERN    __declspec(dllexport)
00017 #  else
00018 #    ifdef  DLL_IMPORT  /* DLL_IMPORT defined (and DLL_EXPORT not): mark symbols as imported */
00019 #      define CABOCHA_DLL_EXTERN  __declspec(dllimport)
00020 #    endif
00021 #  endif
00022 #endif
00023 
00024 #ifndef CABOCHA_DLL_EXTERN  /* fallback for non-Windows builds, or when neither DLL_* macro is defined */
00025 #  define CABOCHA_DLL_EXTERN extern
00026 #endif
00027 
00028   enum cabocha_charset_t {  /* Character-set selector; set/queried per tree via cabocha_tree_set_charset()/cabocha_tree_charset(). */
00029     EUC_JP, CP932, UTF8, ASCII  /* no explicit values: numbered 0..3 in this order */
00030   };
00031   enum cabocha_posset_t  {  /* "posset" selector; set/queried per tree via cabocha_tree_set_posset()/cabocha_tree_posset(). */
00032     IPA, JUMAN, UNIDIC  /* presumably part-of-speech tag sets (IPA, JUMAN, UniDic) -- TODO confirm; numbered 0..2 */
00033   };
00034 
00035   enum cabocha_format_t {  /* Output format selector accepted by cabocha_tree_tostr()/cabocha_tree_tostr2(). */
00036     FORMAT_TREE,
00037     FORMAT_LATTICE,
00038     FORMAT_TREE_LATTICE,  /* presumably tree and lattice output combined -- TODO confirm */
00039     FORMAT_XML,
00040     FORMAT_NONE  /* last enumerator (value 4) */
00041   };
00042 
00043   enum cabocha_input_layer_t {  /* Passed as input_layer to cabocha_tree_read(). */
00044     INPUT_RAW_SENTENCE,  /* presumably: each value states how much analysis the input text already carries -- TODO confirm */
00045     INPUT_POS,
00046     INPUT_CHUNK,
00047     INPUT_SELECTION,
00048     INPUT_DEP
00049   };
00050 
00051   enum cabocha_output_layer_t {  /* Set/queried per tree via cabocha_tree_set_output_layer()/cabocha_tree_output_layer(). */
00052     OUTPUT_RAW_SENTENCE,  /* enumerators mirror cabocha_input_layer_t one-for-one, in the same order */
00053     OUTPUT_POS,
00054     OUTPUT_CHUNK,
00055     OUTPUT_SELECTION,
00056     OUTPUT_DEP
00057   };
00058 
00059   enum cabocha_parser_t {  /* Only aliased in this header (cabocha_parser_t, CaboCha::ParserType); no function declared here takes it. */
00060     TRAIN_NE,  /* NOTE(review): names suggest training modes (NE, chunking, dependency) -- confirm against the implementation */
00061     TRAIN_CHUNK,
00062     TRAIN_DEP
00063   };
00064 
00065   struct cabocha_t;  /* opaque parser handle: declared but never defined in this header */
00066   struct cabocha_tree_t;  /* opaque tree handle: declared but never defined in this header */
00067   struct mecab_node_t;  /* declared only so pointers to it can appear in prototypes; presumably defined by MeCab -- confirm */
00068 
00069   struct cabocha_chunk_t {  /* Per-chunk record; obtained via cabocha_tree_chunk() or cabocha_token_t::chunk. Field meanings below are assumptions. */
00070     int                    link;  /* presumably index of the chunk this one depends on -- TODO confirm (incl. sentinel value) */
00071     unsigned short int     head_pos;  /* presumably position of the head token -- TODO confirm what it is relative to */
00072     unsigned short int     func_pos;  /* presumably position of the function-word token -- TODO confirm */
00073     unsigned short int     token_size;  /* presumably number of tokens in this chunk */
00074     size_t                 token_pos;  /* presumably index of the chunk's first token within the tree */
00075     float                  score;  /* NOTE(review): range and meaning not visible here */
00076     const char             **feature_list;  /* array of read-only C strings */
00077     unsigned short int     feature_list_size;  /* presumably element count of feature_list */
00078   };
00079 
00080   struct cabocha_token_t {  /* Per-token record; obtained via cabocha_tree_token(). Field meanings below are assumptions. */
00081     const char              *surface;  /* presumably the token text as it appears in the sentence */
00082     const char              *normalized_surface;  /* presumably a normalized form of surface -- TODO confirm */
00083     const char              *feature;  /* presumably the analyzer's feature string for this token */
00084     const char             **feature_list;  /* array of read-only C strings; presumably feature split into fields -- confirm */
00085     unsigned short int      feature_list_size;  /* presumably element count of feature_list */
00086     const char              *ne;  /* presumably a named-entity tag -- TODO confirm */
00087     struct cabocha_chunk_t  *chunk;  /* associated chunk record; NOTE(review): may be NULL for some tokens -- confirm */
00088   };
00089 
00090   typedef struct cabocha_t  cabocha_t;  /* aliases so C code can name each type without the struct/enum keyword */
00091   typedef struct cabocha_tree_t  cabocha_tree_t;
00092   typedef struct cabocha_chunk_t cabocha_chunk_t;
00093   typedef struct cabocha_token_t cabocha_token_t;
00094   typedef struct mecab_node_t mecab_node_t;
00095 
00096   typedef enum cabocha_charset_t cabocha_charset_t;  /* same aliasing for the enum types */
00097   typedef enum cabocha_posset_t cabocha_posset_t;
00098   typedef enum cabocha_format_t cabocha_format_t;
00099   typedef enum cabocha_input_layer_t cabocha_input_layer_t;
00100   typedef enum cabocha_output_layer_t cabocha_output_layer_t;
00101   typedef enum cabocha_parser_t cabocha_parser_t;
00102 
00103 #ifndef SWIG
00104   CABOCHA_DLL_EXTERN int                    cabocha_do(int argc, char **argv);  /* argc/argv-style entry point; presumably the command-line driver -- confirm */
00105 
00106   /* parser: functions operating on an opaque cabocha_t handle */
00107   CABOCHA_DLL_EXTERN cabocha_t             *cabocha_new(int argc, char **argv);  /* build a parser from an argument vector */
00108   CABOCHA_DLL_EXTERN cabocha_t             *cabocha_new2(const char *arg);  /* build a parser from a single argument string */
00109   CABOCHA_DLL_EXTERN const char            *cabocha_strerror(cabocha_t* cabocha);  /* message text for the given handle; NOTE(review): NULL-handle behaviour not visible here */
00110   CABOCHA_DLL_EXTERN const char            *cabocha_sparse_tostr(cabocha_t* cabocha,
00111                                                                  const char* str);  /* parse str, result as text; presumably str is NUL-terminated and the result is owned by the parser -- confirm */
00112   CABOCHA_DLL_EXTERN const char            *cabocha_sparse_tostr2(cabocha_t* cabocha,
00113                                                                   const char* str, size_t length);  /* as above, with an explicit input length */
00114   CABOCHA_DLL_EXTERN const char            *cabocha_sparse_tostr3(cabocha_t* cabocha, const char* str, size_t length,
00115                                                                   char *output_str, size_t output_length);  /* as above, with an output buffer and its size supplied by the caller */
00116   CABOCHA_DLL_EXTERN void                  cabocha_destroy(cabocha_t* cabocha);  /* presumably releases a handle obtained from cabocha_new()/cabocha_new2() */
00117   CABOCHA_DLL_EXTERN const cabocha_tree_t  *cabocha_sparse_totree(cabocha_t* cabocha, const char* str);  /* parse str, result as a read-only tree */
00118   CABOCHA_DLL_EXTERN const cabocha_tree_t  *cabocha_sparse_totree2(cabocha_t* cabocha, const char* str, size_t length);  /* as above, with an explicit input length */
00119 
00120   /* tree: functions operating on an opaque cabocha_tree_t handle */
00121   CABOCHA_DLL_EXTERN cabocha_tree_t        *cabocha_tree_new(void);  /* (void) makes this a real prototype in C; takes no arguments */
00122   CABOCHA_DLL_EXTERN void                   cabocha_tree_destroy(cabocha_tree_t* tree);  /* presumably releases a tree obtained from cabocha_tree_new() */
00123   CABOCHA_DLL_EXTERN int                    cabocha_tree_empty(cabocha_tree_t* tree);  /* int used as a boolean; presumably non-zero when empty -- confirm */
00124   CABOCHA_DLL_EXTERN void                   cabocha_tree_clear(cabocha_tree_t* tree);
00125   CABOCHA_DLL_EXTERN void                   cabocha_tree_clear_chunk(cabocha_tree_t* tree);
00126   CABOCHA_DLL_EXTERN size_t                 cabocha_tree_size(cabocha_tree_t* tree);  /* NOTE(review): unit of "size" (tokens vs. chunks) not visible here */
00127   CABOCHA_DLL_EXTERN size_t                 cabocha_tree_chunk_size(cabocha_tree_t* tree);  /* presumably the valid index range of cabocha_tree_chunk() */
00128   CABOCHA_DLL_EXTERN size_t                 cabocha_tree_token_size(cabocha_tree_t* tree);  /* presumably the valid index range of cabocha_tree_token() */
00129   CABOCHA_DLL_EXTERN const char            *cabocha_tree_sentence(cabocha_tree_t* tree);
00130   CABOCHA_DLL_EXTERN size_t                 cabocha_tree_sentence_size(cabocha_tree_t* tree);
00131   CABOCHA_DLL_EXTERN void                   cabocha_tree_set_sentence(cabocha_tree_t* tree,
00132                                                                       const char *sentence,
00133                                                                       size_t length);  /* sentence text plus explicit length */
00134   CABOCHA_DLL_EXTERN int                   cabocha_tree_read(cabocha_tree_t* tree,
00135                                                              const char *input,
00136                                                              size_t length,
00137                                                              cabocha_input_layer_t input_layer);  /* int status; NOTE(review): success/failure encoding not visible here */
00138   CABOCHA_DLL_EXTERN int                   cabocha_tree_read_from_mecab_node(cabocha_tree_t* tree,
00139                                                                              const mecab_node_t *node);  /* populate the tree from a MeCab node instead of text */
00140 
00141   CABOCHA_DLL_EXTERN const cabocha_token_t *cabocha_tree_token(cabocha_tree_t* tree, size_t i);  /* read-only access to record i; NOTE(review): out-of-range behaviour not visible here */
00142   CABOCHA_DLL_EXTERN const cabocha_chunk_t *cabocha_tree_chunk(cabocha_tree_t* tree, size_t i);
00143 
00144   CABOCHA_DLL_EXTERN cabocha_token_t       *cabocha_tree_add_token(cabocha_tree_t* tree);  /* returns a mutable record; presumably newly appended to the tree */
00145   CABOCHA_DLL_EXTERN cabocha_chunk_t       *cabocha_tree_add_chunk(cabocha_tree_t* tree);
00146 
00147   CABOCHA_DLL_EXTERN char                  *cabocha_tree_strdup(cabocha_tree_t* tree, const char *str);  /* presumably allocates from tree-owned storage, so the caller must not free() it -- confirm */
00148   CABOCHA_DLL_EXTERN char                  *cabocha_tree_alloc(cabocha_tree_t* tree, size_t size);
00149 
00150   CABOCHA_DLL_EXTERN const char            *cabocha_tree_tostr(cabocha_tree_t* tree, cabocha_format_t format);  /* render the tree as text in the requested format */
00151   CABOCHA_DLL_EXTERN const char            *cabocha_tree_tostr2(cabocha_tree_t* tree, cabocha_format_t format,
00152                                                                 char *str, size_t length);  /* as above, with an output buffer and its size supplied by the caller */
00153 
00154   CABOCHA_DLL_EXTERN void                   cabocha_tree_set_charset(cabocha_tree_t* tree,
00155                                                                      cabocha_charset_t charset);  /* setter/getter pairs for per-tree settings follow */
00156   CABOCHA_DLL_EXTERN cabocha_charset_t      cabocha_tree_charset(cabocha_tree_t* tree);
00157   CABOCHA_DLL_EXTERN void                   cabocha_tree_set_posset(cabocha_tree_t* tree,
00158                                                                     cabocha_posset_t posset);
00159   CABOCHA_DLL_EXTERN cabocha_posset_t       cabocha_tree_posset(cabocha_tree_t* tree);
00160   CABOCHA_DLL_EXTERN void                   cabocha_tree_set_output_layer(cabocha_tree_t* tree,
00161                                                                           cabocha_output_layer_t output_layer);
00162   CABOCHA_DLL_EXTERN cabocha_output_layer_t cabocha_tree_output_layer(cabocha_tree_t* tree);
00163 
00164   CABOCHA_DLL_EXTERN int                    cabocha_learn(int argc, char **argv);  /* argc/argv-style entry points; presumably the bodies of separate command-line tools -- confirm */
00165   CABOCHA_DLL_EXTERN int                    cabocha_system_eval(int argc, char **argv);
00166   CABOCHA_DLL_EXTERN int                    cabocha_model_index(int argc, char **argv);
00167 #endif
00168 
00169 #ifdef __cplusplus
00170 }
00171 #endif
00172 
00173 /* for C++ */
00174 #ifdef __cplusplus
00175 
00176 namespace CaboCha {
00177 
00178 class Tree;  /* forward declaration; the class is defined later in this namespace */
00179 typedef struct cabocha_chunk_t Chunk;  /* C++ names for the C structs: same types, no wrapping */
00180 typedef struct cabocha_token_t Token;
00181 
00182 typedef enum cabocha_charset_t CharsetType;  /* C++ names for the C enums; the enumerators themselves stay in the global namespace */
00183 typedef enum cabocha_posset_t PossetType;
00184 typedef enum cabocha_format_t FormatType;
00185 typedef enum cabocha_input_layer_t InputLayerType;
00186 typedef enum cabocha_output_layer_t OutputLayerType;
00187 typedef enum cabocha_parser_t ParserType;
00188 
00189 class TreeAllocator;  /* declared only; not defined in this header (Tree holds and returns a pointer to it) */
00190 
00191 class Tree {  /* C++ interface to a parse tree: a sentence plus indexed Token and Chunk records */
00192  public:
00193   void set_sentence(const char *sentence);  /* presumably expects a NUL-terminated string -- confirm */
00194   const char *sentence() const;
00195   size_t sentence_size() const;
00196 
00197 #ifndef SWIG  /* this overload is hidden when SWIG processes the header */
00198   void set_sentence(const char *sentence, size_t length);
00199 #endif
00200 
00201   const Chunk *chunk(size_t i) const;  /* read-only record access by index; NOTE(review): out-of-range behaviour not visible here */
00202   const Token *token(size_t i) const;
00203 
00204 #ifndef SWIG  /* mutating and allocation members, hidden from SWIG */
00205   Chunk *mutable_chunk(size_t i);
00206   Token *mutable_token(size_t i);
00207 
00208   Token *add_token();  /* returns a mutable record; presumably newly appended */
00209   Chunk *add_chunk();
00210 
00211   char *strdup(const char *str);  /* presumably allocate through the tree's TreeAllocator -- confirm ownership */
00212   char *alloc(size_t size);
00213   char **alloc_char_array(size_t size);
00214 
00215   TreeAllocator *allocator() const;
00216 #endif
00217 
00218   bool   read(const char *input,
00219               InputLayerType input_layer);  /* populate the tree from text at the given input layer */
00220 
00221 #ifndef SWIG
00222   bool   read(const char *input, size_t length,
00223               InputLayerType input_layer);  /* as above, with an explicit input length */
00224   bool   read(const mecab_node_t *node);  /* populate the tree from a MeCab node */
00225 #endif
00226 
00227   bool   empty() const;
00228   void   clear();
00229   void   clear_chunk();
00230 
00231   size_t chunk_size() const;
00232   size_t token_size() const;
00233   size_t size() const;  /* NOTE(review): unit of "size" (tokens vs. chunks) not visible here */
00234 
00235   const char *toString(FormatType output_format);  /* non-const, unlike the buffer overload below */
00236 
00237 #ifndef SWIG
00238   const char *toString(FormatType output_format,
00239                        char *output, size_t length) const;  /* renders into a buffer supplied by the caller */
00240 #endif
00241 
00242   CharsetType charset() const { return charset_; }  /* inline accessors over the three private setting fields */
00243   void set_charset(CharsetType charset) { charset_ = charset; }
00244   PossetType posset() const { return posset_; }
00245   void set_posset(PossetType posset) { posset_ = posset; }
00246   OutputLayerType output_layer() const { return output_layer_; }
00247   void set_output_layer(OutputLayerType output_layer) { output_layer_ = output_layer; }
00248 
00249   const char *what();  /* presumably the last error message -- confirm */
00250 
00251   explicit Tree();
00252   virtual ~Tree();
00253 
00254  private:
00255   TreeAllocator              *tree_allocator_;  /* presumably what allocator() returns; NOTE(review): raw pointer, copy semantics not declared here */
00256   CharsetType                 charset_;  /* read/written by charset()/set_charset() */
00257   PossetType                  posset_;  /* read/written by posset()/set_posset() */
00258   OutputLayerType             output_layer_;  /* read/written by output_layer()/set_output_layer() */
00259 };
00260 
00261 class Parser {  /* Abstract parser interface: every parse/what member is pure virtual; instances come from the factories */
00262  public:
00263   virtual const Tree *parse(const char *input)                          = 0;  /* parse text, result as a read-only Tree */
00264   virtual const char *parseToString(const char *input)                  = 0;  /* parse text, result as a string */
00265   virtual const Tree *parse(Tree *tree) const                           = 0;  /* parse starting from an existing Tree; the only const parse overload */
00266 
00267 #ifndef SWIG  /* explicit-length and caller-buffer overloads, hidden from SWIG */
00268   virtual const Tree *parse(const char *input, size_t length)           = 0;
00269   virtual const char *parseToString(const char *input, size_t length)   = 0;
00270   virtual const char *parseToString(const char *input, size_t length,
00271                                     char       *output, size_t output_length) = 0;
00272 #endif
00273 
00274   virtual const char *what() = 0;  /* presumably the last error message -- confirm */
00275   static const char *version();
00276 
00277   virtual ~Parser() {}  /* virtual so implementations can be destroyed through a Parser pointer */
00278 
00279 #ifndef SWIG
00280   static Parser *create(int argc, char **argv);  /* factories; NOTE(review): ownership of the returned pointer and failure value not visible here */
00281   static Parser *create(const char *arg);
00282 #endif
00283 };
00284 
00285 CABOCHA_DLL_EXTERN Parser *createParser(int argc, char **argv);  /* namespace-level factories mirroring Parser::create(); unlike those, not hidden from SWIG */
00286 CABOCHA_DLL_EXTERN Parser *createParser(const char *arg);
00287 CABOCHA_DLL_EXTERN const char *getParserError();  /* NOTE(review): relationship between these two error accessors is not visible here */
00288 CABOCHA_DLL_EXTERN const char *getLastError();
00289 }
00290 #endif
00291 #endif