建表语句释义
语法帮助
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later)
[(col_name data_type [column_constraint_specification] [COMMENT col_comment], ... [constraint_specification])]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive 0.10.0 and later)]
ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
[STORED AS DIRECTORIES]
[
[ROW FORMAT row_format]
[STORED AS file_format]
| STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive 0.6.0 and later)
]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive 0.6.0 and later)
[AS select_statement]; -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
LIKE existing_table_or_view_name
[LOCATION hdfs_path];
data_type
: primitive_type
| array_type
| map_type
| struct_type
| union_type -- (Note: Available in Hive 0.7.0 and later)
primitive_type
: TINYINT
| SMALLINT
| INT
| BIGINT
| BOOLEAN
| FLOAT
| DOUBLE
| DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
| STRING
| BINARY -- (Note: Available in Hive 0.8.0 and later)
| TIMESTAMP -- (Note: Available in Hive 0.8.0 and later)
| DECIMAL -- (Note: Available in Hive 0.11.0 and later)
| DECIMAL(precision, scale) -- (Note: Available in Hive 0.13.0 and later)
| DATE -- (Note: Available in Hive 0.12.0 and later)
| VARCHAR -- (Note: Available in Hive 0.12.0 and later)
| CHAR -- (Note: Available in Hive 0.13.0 and later)
array_type
: ARRAY < data_type >
map_type
: MAP < primitive_type, data_type >
struct_type
: STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type
: UNIONTYPE < data_type, data_type, ... > -- (Note: Available in Hive 0.7.0 and later)
row_format
: DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
[NULL DEFINED AS char] -- (Note: Available in Hive 0.13 and later)
| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
file_format:
: SEQUENCEFILE
| TEXTFILE -- (Default, depending on hive.default.fileformat configuration)
| RCFILE -- (Note: Available in Hive 0.6.0 and later)
| ORC -- (Note: Available in Hive 0.11.0 and later)
| PARQUET -- (Note: Available in Hive 0.13.0 and later)
| AVRO -- (Note: Available in Hive 0.14.0 and later)
| JSONFILE -- (Note: Available in Hive 4.0.0 and later)
| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
column_constraint_specification:
: [ PRIMARY KEY|UNIQUE|NOT NULL|DEFAULT [default_value]|CHECK [check_expression] ENABLE|DISABLE NOVALIDATE RELY/NORELY ]
default_value:
: [ LITERAL|CURRENT_USER()|CURRENT_DATE()|CURRENT_TIMESTAMP()|NULL ]
constraint_specification:
: [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
[, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
[, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE
[, CONSTRAINT constraint_name UNIQUE (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
[, CONSTRAINT constraint_name CHECK [check_expression] ENABLE|DISABLE NOVALIDATE RELY/NORELY ]
[EXTERNAL]:声明为外部表,往往在该表需要被多个工具共享时声明,外部表删表不会删数据,只会删元数据。
col_name datatype:data_type 一定要严谨定义,避免 bigint、double 等等统统用 string 的偷懒做法,否则不知某天数据就出错了。
[if not exists]:创建时不指定,若存在同名表则返回出错。指定此选项,若存在同名表忽略后续,不存在则创建。
[DEFAULT value]:指定列的默认值,当INSERT操作不指定该列时,该列写入默认值。
[PARTITIONED BY]:指定表的分区字段,当利用分区字段对表进行分区时,新增分区、更新分区内数据和读取分区数据均不需做全表扫描,可以提高处理效率。
[LIFECYCLE]:是表的生命周期,分区表则每个分区的生命周期与表生命周期相同
[AS select_statement]:意味着可直接跟 select 语句插入数据
0: jdbc:hive2://localhost:10000/> create table if not exists dept
. . . . . . . . . . . . . . . . > (deptno int comment 'department number ',
. . . . . . . . . . . . . . . . > dname string comment 'department name ',
. . . . . . . . . . . . . . . . > loc string comment 'department location' )
. . . . . . . . . . . . . . . . > comment 'department info table '
. . . . . . . . . . . . . . . . > row format delimited
. . . . . . . . . . . . . . . . > fields terminated by '\t'
. . . . . . . . . . . . . . . . > collection items terminated by ','
. . . . . . . . . . . . . . . . > map keys terminated by ':'
. . . . . . . . . . . . . . . . > lines terminated by '\n';
row format delimited : 用来声明各类分隔符
fields terminated by ‘\t’ : 声明表中列间分隔符
collection items terminated by ‘,’ :声明复杂类型元素分隔符
map keys terminated by ‘:’:声明 MAP类型分隔符
lines terminated by ‘\n’;:声明行间分隔符
数据类型
加载数据
从本地加载数据
0: jdbc:hive2://localhost:10000/> load data local inpath '/root/data.txt' into table dept ;
No rows affected (1.725 seconds)
0: jdbc:hive2://localhost:10000/> select * frrom dept ;
Error: Error while compiling statement: FAILED: ParseException line 1:9 missing EOF at 'frrom' near '*' (state=42000,code=40000)
0: jdbc:hive2://localhost:10000/> select * from dept ;
+--------------+-------------+-----------+--+
| dept.deptno | dept.dname | dept.loc |
+--------------+-------------+-----------+--+
| 1 | IT | BJ |
| 2 | MARK | SH |
| 3 | SALES | SZ |
+--------------+-------------+-----------+--+
3 rows selected (1.877 seconds)
从HDFS加载数据
0: jdbc:hive2://localhost:10000/> load data inpath '/user/hive/warehouse/d2.db/dept/data.txt' into table dept_2 ;
No rows affected (0.852 seconds)
0: jdbc:hive2://localhost:10000/> select * from dept_2 ;
+----------------+---------------+-------------+--+
| dept_2.deptno | dept_2.dname | dept_2.loc |
+----------------+---------------+-------------+--+
| 1 | IT | BJ |
| 2 | MARK | SH |
| 3 | SALES | SZ |
+----------------+---------------+-------------+--+
3 rows selected (0.336 seconds)
查看加载后的数据
[root@node1 ~]# hdfs dfs -ls -R /user/hive/warehouse/
drwxr-xr-x - root supergroup 0 2020-08-25 09:18 /user/hive/warehouse/d2.db
drwxr-xr-x - root supergroup 0 2020-08-25 09:21 /user/hive/warehouse/d2.db/dept
drwxr-xr-x - root supergroup 0 2020-08-25 09:21 /user/hive/warehouse/d2.db/dept_2
-rwxr-xr-x 3 root supergroup 29 2020-08-25 05:57 /user/hive/warehouse/d2.db/dept_2/data.txt
drwxr-xr-x - root supergroup 0 2020-08-24 16:32 /user/hive/warehouse/d2.db/t1
-rwxr-xr-x 3 root supergroup 2 2020-08-24 16:32 /user/hive/warehouse/d2.db/t1/000000_0
drwxr-xr-x - root supergroup 0 2020-08-25 05:46 /user/hive/warehouse/d2.db/t3
drwxr-xr-x - root supergroup 0 2020-08-25 05:47 /user/hive/warehouse/d2.db/t4
drwxr-xr-x - root supergroup 0 2020-08-22 02:58 /user/hive/warehouse/dao_t1
-rwxr-xr-x 3 root supergroup 11 2020-08-22 02:57 /user/hive/warehouse/dao_t1/000000_0
诡异的事情出现了,data.txt从dept目录下,迁移至dept2目录下。 我们查看一下dept表,发现数据也没有了。
0: jdbc:hive2://localhost:10000/> select * from dept ;
+--------------+-------------+-----------+--+
| dept.deptno | dept.dname | dept.loc |
+--------------+-------------+-----------+--+
+--------------+-------------+-----------+--+
No rows selected (0.953 seconds)
分区表的创建
0: jdbc:hive2://localhost:10000/> create table dao_part (
. . . . . . . . . . . . . . . . > userid string ,
. . . . . . . . . . . . . . . . > login_channel int ,
. . . . . . . . . . . . . . . . > device_type string
. . . . . . . . . . . . . . . . > ) PARTITIONED by (month int , day int ) ;
No rows affected (0.563 seconds)
向分区表中插入数据
0: jdbc:hive2://localhost:10000/> insert into dao_part PARTITION (month=202008, day=03) values
. . . . . . . . . . . . . . . . > ('user1',2,'ios' ) ;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
No rows affected (31.873 seconds)
0: jdbc:hive2://localhost:10000/> insert into dao_part PARTITION (month=201907, day=07) values
. . . . . . . . . . . . . . . . > ('user1',2,'anroid' ) ;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
No rows affected (22.594 seconds)
0: jdbc:hive2://localhost:10000/> select * from dao_part ;
+------------------+-------------------------+-----------------------+-----------------+---------------+--+
| dao_part.userid | dao_part.login_channel | dao_part.device_type | dao_part.month | dao_part.day |
+------------------+-------------------------+-----------------------+-----------------+---------------+--+
| user1 | 2 | anroid | 201907 | 7 |
| user1 | 2 | ios | 202008 | 3 |
+------------------+-------------------------+-----------------------+-----------------+---------------+--+
查看分区
0: jdbc:hive2://localhost:10000/> SHOW PARTITIONS dao_part ;
+---------------------+--+
| partition |
+---------------------+--+
| month=201907/day=7 |
| month=202008/day=3 |
+---------------------+--+
加载数据值分区表
制作测试数据文本
[root@node1 ~]# cat -A data_user.txt
user3^A3^Aweb$
user4^A4^Amac$
加载数据
0: jdbc:hive2://localhost:10000/> load data local inpath '/root/data_user.txt' into table dao_part partition(month=202008,day=03);load data local inpath '/root/data_user.txt' into table dao_part partition(month=202008,day=03);
No rows affected (1.26 seconds)
No rows affected (1.229 seconds)
查看数据
0: jdbc:hive2://localhost:10000/> select * from dao_part ;
+------------------+-------------------------+-----------------------+-----------------+---------------+--+
| dao_part.userid | dao_part.login_channel | dao_part.device_type | dao_part.month | dao_part.day |
+------------------+-------------------------+-----------------------+-----------------+---------------+--+
| user1 | 2 | anroid | 201907 | 7 |
| user1 | 2 | ios | 202008 | 3 |
| user3 | 3 | web | 202008 | 3 |
| user4 | 4 | mac | 202008 | 3 |
| user3 | 3 | web | 202008 | 3 |
| user4 | 4 | mac | 202008 | 3 |
+------------------+-------------------------+-----------------------+-----------------+---------------+--+
6 rows selected (0.358 seconds)
查看相关路径
[root@node1 ~]# hdfs dfs -ls -R /user/hive/warehouse/d2.db/dao_part/month=202008/day=3/data_user_copy_1.txt
-rwxr-xr-x 3 root supergroup 24 2020-08-25 11:31 /user/hive/warehouse/d2.db/dao_part/month=202008/day=3/data_user_copy_1.txt
[root@node1 ~]# hdfs dfs -cat /user/hive/warehouse/d2.db/dao_part/month=202008/day=3/data_user_copy_1.txt
user33web
user44mac
桶表
如果说分区表对应Oracle数据库的列表分区表,那么桶表就对应Oracle中的哈希分区表。
建立桶表
0: jdbc:hive2://localhost:10000/> create table dao_part_bucket (
. . . . . . . . . . . . . . . . > userid string ,
. . . . . . . . . . . . . . . . > login_channel int ,
. . . . . . . . . . . . . . . . > device_type string
. . . . . . . . . . . . . . . . > ) clustered by (login_channel )
. . . . . . . . . . . . . . . . > into 3 buckets ;
No rows affected (0.212 seconds)
查看桶表
0: jdbc:hive2://localhost:10000/> show create table dao_part_bucket ;
+-----------------------------------------------------------------+--+
| createtab_stmt |
+-----------------------------------------------------------------+--+
| CREATE TABLE `dao_part_bucket`( |
| `userid` string, |
| `login_channel` int, |
| `device_type` string) |
| CLUSTERED BY ( |
| login_channel) |
| INTO 3 BUCKETS |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.mapred.TextInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION |
| 'hdfs://mycluster/user/hive/warehouse/d2.db/dao_part_bucket' |
| TBLPROPERTIES ( |
| 'COLUMN_STATS_ACCURATE'='{\"BASIC_STATS\":\"true\"}', |
| 'numFiles'='0', |
| 'numRows'='0', |
| 'rawDataSize'='0', |
| 'totalSize'='0', |
| 'transient_lastDdlTime'='1598372677') |
+-----------------------------------------------------------------+--+
22 rows selected (0.634 seconds)
插入数据
0: jdbc:hive2://localhost:10000/> insert into dao_part_bucket (userid,login_channel,device_type) select userid,login_channel,device_type from dao_part ;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
No rows affected (64.268 seconds)
0: jdbc:hive2://localhost:10000/> select * from dao_part_bucket ;
+-------------------------+--------------------------------+------------------------------+--+
| dao_part_bucket.userid | dao_part_bucket.login_channel | dao_part_bucket.device_type |
+-------------------------+--------------------------------+------------------------------+--+
| user3 | 3 | web |
| user3 | 3 | web |
| user4 | 4 | mac |
| user4 | 4 | mac |
| user1 | 2 | ios |
| user1 | 2 | anroid |
+-------------------------+--------------------------------+------------------------------+--+
查看表目录
[root@node1 ~]# hdfs dfs -ls -R /user/hive/warehouse/d2.db/dao_part_bucket
-rwxr-xr-x 3 root supergroup 24 2020-08-25 12:40 /user/hive/warehouse/d2.db/dao_part_bucket/000000_0
-rwxr-xr-x 3 root supergroup 24 2020-08-25 12:39 /user/hive/warehouse/d2.db/dao_part_bucket/000001_0
-rwxr-xr-x 3 root supergroup 27 2020-08-25 12:40 /user/hive/warehouse/d2.db/dao_part_bucket/000002_0
查看各个文件内的数据
[root@node1 ~]# hdfs dfs -cat /user/hive/warehouse/d2.db/dao_part_bucket/000000_0
user33web
user33web
[root@node1 ~]# hdfs dfs -cat /user/hive/warehouse/d2.db/dao_part_bucket/000001_0
user44mac
user44mac
[root@node1 ~]# hdfs dfs -cat /user/hive/warehouse/d2.db/dao_part_bucket/000002_0
user12ios
user12anroid