CaboCha
|
/*
  CaboCha -- Yet Another Japanese Dependency Parser
  $Id: cabocha.h 50 2009-05-03 08:25:36Z taku-ku $;
  Copyright(C) 2001-2008 Taku Kudo <taku@chasen.org>
*/
#ifndef CABOCHA_CABOCHA_H_
#define CABOCHA_CABOCHA_H_

/* Included ahead of the extern "C" block so the standard header is never
   pulled in under C linkage when this file is compiled as C++. */
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * CABOCHA_DLL_EXTERN is the storage/linkage prefix put on every exported
 * declaration below.  On _WIN32 it becomes __declspec(dllexport) when
 * DLL_EXPORT is defined, or __declspec(dllimport) when DLL_IMPORT is defined.
 * In every other configuration it falls back to plain `extern`.
 */
#ifdef _WIN32
#  ifdef DLL_EXPORT
#    define CABOCHA_DLL_EXTERN __declspec(dllexport)
#  else
#    ifdef DLL_IMPORT
#      define CABOCHA_DLL_EXTERN __declspec(dllimport)
#    endif
#  endif
#endif

#ifndef CABOCHA_DLL_EXTERN
#  define CABOCHA_DLL_EXTERN extern
#endif

/* Character set selector (see cabocha_tree_set_charset / cabocha_tree_charset). */
enum cabocha_charset_t {
  EUC_JP, CP932, UTF8, ASCII
};

/* "posset" selector (see cabocha_tree_set_posset / cabocha_tree_posset).
   NOTE(review): presumably the part-of-speech tag set in use -- confirm. */
enum cabocha_posset_t {
  IPA, JUMAN, UNIDIC
};

/* Output format selector accepted by cabocha_tree_tostr / cabocha_tree_tostr2. */
enum cabocha_format_t {
  FORMAT_TREE,
  FORMAT_LATTICE,
  FORMAT_TREE_LATTICE,
  FORMAT_XML,
  FORMAT_NONE
};

/* Input layer selector accepted by cabocha_tree_read. */
enum cabocha_input_layer_t {
  INPUT_RAW_SENTENCE,
  INPUT_POS,
  INPUT_CHUNK,
  INPUT_SELECTION,
  INPUT_DEP
};

/* Output layer selector (see cabocha_tree_set_output_layer /
   cabocha_tree_output_layer).  The enumerators mirror cabocha_input_layer_t
   one for one. */
enum cabocha_output_layer_t {
  OUTPUT_RAW_SENTENCE,
  OUTPUT_POS,
  OUTPUT_CHUNK,
  OUTPUT_SELECTION,
  OUTPUT_DEP
};

/* Parser kind selector.  No function declared in this header takes it. */
enum cabocha_parser_t {
  TRAIN_NE,
  TRAIN_CHUNK,
  TRAIN_DEP
};

/* Opaque to users of this header: only declared, never defined here, so they
   can be handled through pointers only. */
struct cabocha_t;
struct cabocha_tree_t;
struct mecab_node_t;

/* One chunk of a tree, as returned by cabocha_tree_chunk / cabocha_tree_add_chunk.
   NOTE(review): field meanings below are inferred from their names -- confirm
   against the implementation before relying on them. */
struct cabocha_chunk_t {
  int                    link;              /* presumably the index of the chunk this one depends on */
  unsigned short int     head_pos;
  unsigned short int     func_pos;
  unsigned short int     token_size;        /* presumably the number of tokens in this chunk */
  size_t                 token_pos;         /* presumably the index of the chunk's first token */
  float                  score;
  const char           **feature_list;      /* array of feature strings */
  unsigned short int     feature_list_size; /* presumably the element count of feature_list */
};

/* One token of a tree, as returned by cabocha_tree_token / cabocha_tree_add_token.
   NOTE(review): field meanings are inferred from their names -- confirm. */
struct cabocha_token_t {
  const char              *surface;
  const char              *normalized_surface;
  const char              *feature;
  const char             **feature_list;      /* array of feature strings */
  unsigned short int       feature_list_size; /* presumably the element count of feature_list */
  const char              *ne;
  struct cabocha_chunk_t  *chunk;             /* associated chunk, if any */
};

/* Tag-free spellings of the structs and enums above. */
typedef struct cabocha_t        cabocha_t;
typedef struct cabocha_tree_t   cabocha_tree_t;
typedef struct cabocha_chunk_t  cabocha_chunk_t;
typedef struct cabocha_token_t  cabocha_token_t;
typedef struct mecab_node_t     mecab_node_t;

typedef enum cabocha_charset_t      cabocha_charset_t;
typedef enum cabocha_posset_t       cabocha_posset_t;
typedef enum cabocha_format_t       cabocha_format_t;
typedef enum cabocha_input_layer_t  cabocha_input_layer_t;
typedef enum cabocha_output_layer_t cabocha_output_layer_t;
typedef enum cabocha_parser_t       cabocha_parser_t;

/* The plain C API is hidden from SWIG. */
#ifndef SWIG
CABOCHA_DLL_EXTERN int cabocha_do(int argc, char **argv);

/* parser */
CABOCHA_DLL_EXTERN cabocha_t *cabocha_new(int argc, char **argv);
CABOCHA_DLL_EXTERN cabocha_t *cabocha_new2(const char *arg);
CABOCHA_DLL_EXTERN const char *cabocha_strerror(cabocha_t *cabocha);

/* The *_tostr family returns text and the *_totree family returns a tree.
   The "2" variants take an explicit input length; tostr3 additionally takes
   an output_str / output_length pair (presumably a caller-supplied output
   buffer -- confirm). */
CABOCHA_DLL_EXTERN const char *cabocha_sparse_tostr(cabocha_t *cabocha,
                                                    const char *str);
CABOCHA_DLL_EXTERN const char *cabocha_sparse_tostr2(cabocha_t *cabocha,
                                                     const char *str,
                                                     size_t length);
CABOCHA_DLL_EXTERN const char *cabocha_sparse_tostr3(cabocha_t *cabocha,
                                                     const char *str,
                                                     size_t length,
                                                     char *output_str,
                                                     size_t output_length);
CABOCHA_DLL_EXTERN void cabocha_destroy(cabocha_t *cabocha);
CABOCHA_DLL_EXTERN const cabocha_tree_t *cabocha_sparse_totree(cabocha_t *cabocha,
                                                               const char *str);
CABOCHA_DLL_EXTERN const cabocha_tree_t *cabocha_sparse_totree2(cabocha_t *cabocha,
                                                                const char *str,
                                                                size_t length);

/* tree */
/* (void) gives this declaration a real prototype in C; with empty parentheses
   a pre-C23 compiler would silently accept calls that pass arguments. */
CABOCHA_DLL_EXTERN cabocha_tree_t *cabocha_tree_new(void);
CABOCHA_DLL_EXTERN void cabocha_tree_destroy(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN int cabocha_tree_empty(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_clear(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_clear_chunk(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN size_t cabocha_tree_size(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN size_t cabocha_tree_chunk_size(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN size_t cabocha_tree_token_size(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN const char *cabocha_tree_sentence(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN size_t cabocha_tree_sentence_size(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_set_sentence(cabocha_tree_t *tree,
                                                  const char *sentence,
                                                  size_t length);
CABOCHA_DLL_EXTERN int cabocha_tree_read(cabocha_tree_t *tree,
                                         const char *input,
                                         size_t length,
                                         cabocha_input_layer_t input_layer);
CABOCHA_DLL_EXTERN int cabocha_tree_read_from_mecab_node(cabocha_tree_t *tree,
                                                         const mecab_node_t *node);

/* Read-only access to the i-th token / chunk. */
CABOCHA_DLL_EXTERN const cabocha_token_t *cabocha_tree_token(cabocha_tree_t *tree,
                                                             size_t i);
CABOCHA_DLL_EXTERN const cabocha_chunk_t *cabocha_tree_chunk(cabocha_tree_t *tree,
                                                             size_t i);

/* These return a mutable token / chunk. */
CABOCHA_DLL_EXTERN cabocha_token_t *cabocha_tree_add_token(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN cabocha_chunk_t *cabocha_tree_add_chunk(cabocha_tree_t *tree);

/* NOTE(review): presumably the returned memory belongs to `tree` rather than
   to the caller -- confirm before freeing it. */
CABOCHA_DLL_EXTERN char *cabocha_tree_strdup(cabocha_tree_t *tree, const char *str);
CABOCHA_DLL_EXTERN char *cabocha_tree_alloc(cabocha_tree_t *tree, size_t size);

CABOCHA_DLL_EXTERN const char *cabocha_tree_tostr(cabocha_tree_t *tree,
                                                  cabocha_format_t format);
CABOCHA_DLL_EXTERN const char *cabocha_tree_tostr2(cabocha_tree_t *tree,
                                                   cabocha_format_t format,
                                                   char *str, size_t length);

/* Getter / setter pairs for the per-tree charset, posset and output layer. */
CABOCHA_DLL_EXTERN void cabocha_tree_set_charset(cabocha_tree_t *tree,
                                                 cabocha_charset_t charset);
CABOCHA_DLL_EXTERN cabocha_charset_t cabocha_tree_charset(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_set_posset(cabocha_tree_t *tree,
                                                cabocha_posset_t posset);
CABOCHA_DLL_EXTERN cabocha_posset_t cabocha_tree_posset(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_set_output_layer(cabocha_tree_t *tree,
                                                      cabocha_output_layer_t output_layer);
CABOCHA_DLL_EXTERN cabocha_output_layer_t cabocha_tree_output_layer(cabocha_tree_t *tree);

/* Entry points taking an argc / argv pair.
   NOTE(review): presumably the mains of the bundled tools -- confirm. */
CABOCHA_DLL_EXTERN int cabocha_learn(int argc, char **argv);
CABOCHA_DLL_EXTERN int cabocha_system_eval(int argc, char **argv);
CABOCHA_DLL_EXTERN int cabocha_model_index(int argc, char **argv);
#endif

#ifdef __cplusplus
}
#endif

/* for C++ */
#ifdef __cplusplus

namespace CaboCha {

class Tree;

/* C++ names for the C structs and enums declared above. */
typedef struct cabocha_chunk_t Chunk;
typedef struct cabocha_token_t Token;

typedef enum cabocha_charset_t      CharsetType;
typedef enum cabocha_posset_t       PossetType;
typedef enum cabocha_format_t       FormatType;
typedef enum cabocha_input_layer_t  InputLayerType;
typedef enum cabocha_output_layer_t OutputLayerType;
typedef enum cabocha_parser_t       ParserType;

class TreeAllocator;

/* C++ tree class.  Its members parallel the cabocha_tree_* C functions.
   Members inside `#ifndef SWIG` are hidden from SWIG. */
class Tree {
 public:
  void set_sentence(const char *sentence);
  const char *sentence() const;
  size_t sentence_size() const;

#ifndef SWIG
  void set_sentence(const char *sentence, size_t length);
#endif

  const Chunk *chunk(size_t i) const;
  const Token *token(size_t i) const;

#ifndef SWIG
  Chunk *mutable_chunk(size_t i);
  Token *mutable_token(size_t i);

  Token *add_token();
  Chunk *add_chunk();

  char *strdup(const char *str);
  char *alloc(size_t size);
  char **alloc_char_array(size_t size);

  TreeAllocator *allocator() const;
#endif

  bool read(const char *input,
            InputLayerType input_layer);

#ifndef SWIG
  bool read(const char *input, size_t length,
            InputLayerType input_layer);
  bool read(const mecab_node_t *node);
#endif

  bool empty() const;
  void clear();
  void clear_chunk();

  size_t chunk_size() const;
  size_t token_size() const;
  size_t size() const;

  const char *toString(FormatType output_format);

#ifndef SWIG
  const char *toString(FormatType output_format,
                       char *output, size_t length) const;
#endif

  /* Inline accessors: each one only reads or writes the matching member. */
  CharsetType charset() const { return charset_; }
  void set_charset(CharsetType charset) { charset_ = charset; }
  PossetType posset() const { return posset_; }
  void set_posset(PossetType posset) { posset_ = posset; }
  OutputLayerType output_layer() const { return output_layer_; }
  void set_output_layer(OutputLayerType output_layer) { output_layer_ = output_layer; }

  const char *what();

  explicit Tree();
  virtual ~Tree();

 private:
  TreeAllocator   *tree_allocator_;
  CharsetType      charset_;
  PossetType       posset_;
  OutputLayerType  output_layer_;
};

/* Abstract parser interface.  Every non-static member except the destructor
   is pure virtual.  Instances come from the static create() members or from
   the free createParser() functions declared after the class. */
class Parser {
 public:
  virtual const Tree *parse(const char *input) = 0;
  virtual const char *parseToString(const char *input) = 0;
  virtual const Tree *parse(Tree *tree) const = 0;

#ifndef SWIG
  virtual const Tree *parse(const char *input, size_t length) = 0;
  virtual const char *parseToString(const char *input, size_t length) = 0;
  virtual const char *parseToString(const char *input, size_t length,
                                    char *output, size_t output_length) = 0;
#endif

  virtual const char *what() = 0;
  static const char *version();

  virtual ~Parser() {}

#ifndef SWIG
  static Parser *create(int argc, char **argv);
  static Parser *create(const char *arg);
#endif
};

CABOCHA_DLL_EXTERN Parser *createParser(int argc, char **argv);
CABOCHA_DLL_EXTERN Parser *createParser(const char *arg);
CABOCHA_DLL_EXTERN const char *getParserError();
CABOCHA_DLL_EXTERN const char *getLastError();
}
#endif
#endif