hive tips

hive1.1的jline问题

删除$HADOOP_HOME/share/hadoop/yarn/lib/jline-0.9.94.jar

mysql日志写入问题

ubuntu在/etc/mysql/my.cnf加入以下内容

[mysqld]                                                                                                                
# for hive                                                                                                              
binlog_format=mixed 

hive导出csv,默认用"\t"分隔数据

INSERT OVERWRITE LOCAL DIRECTORY '/home/laomie/init.csv' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select * from android_init

hive --database game -e 'select * from android_init' | sed 's/[[:space:]]\+/,/g' > /home/laomie/init.csv

hive导出表结构

hive --database game -e 'show create table android_init' > android_init.sql

hive的slf4j

rm -fr $HADOOP_HOME/share/hadoop/common/lib/slf4j*
mv $HBASE_HOME/lib/slf4j* $HADOOP_HOME/share/hadoop/common/lib
jar -xvf $SPARK_HOME/lib/spark-assembly*.jar
# delete slf4j path and zip the directory
jar -cvf $SPARK_HOME/lib/spark-assembly-1.4.0-hadoop2.6.0.jar $SPARK_HOME/lib/spark-assembly-1.4.0-hadoop2.6.0/

hive的timestamp时间差

CREATE TABLE ts (txt string, st Timestamp, et Timestamp) 
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

select 
  txt, 
  cast(
    round(
      cast((e-s) as double) * 1000
    ) as int
  ) latency 
from (select txt, cast(st as double) s, cast(et as double) e from ts) q;

hive删除表

truncate table table_name;
truncate table table_name partition (dt = '2015-12-02');
alter table table_name drop if exists partition (dt = '2015-12-02');

变更机器名

因为hive的metadata中包括机器名,当机器名变更后,需要改metadata里的机器名。可用以下命令查找并修改机器名

# 查找表的hdfs路径包括机器名
hive> describe formatted [tablename];
# 修改表的hdfs路径
hive> alter table [tablename] set location "hdfs://[newhostname]:8020/user/hive/warehouse/[tablename]"; 

建外部表

将spark的dataframe保存orc文件

df.write.mode(SaveMode.Overwrite).format("orc").save("/user/laomie/country")

hive用外部表关联orc文件

CREATE EXTERNAL TABLE country_temp (country_id int, country string, last_update timestamp)
STORED AS ORC LOCATION "/user/laomie/country";

移动表

alter table table_name rename to new_database.table_name

变更列

ALTER TABLE table_name CHANGE old_col_name new_col_name new_data_type
-- 变更分区列
ALTER TABLE {table_name} PARTITION COLUMN ({column_name} {column_type});
MSCK REPAIR TABLE <tablename>;

Hive在spark2.1启动时无法访问../lib/spark-assembly-*.jar

编辑$HIVE_HOME/bin/hive

sparkAssemblyPath=`ls ${SPARK_HOME}/jars/spark-*.jar`

Hive连接远程metastore

node2结点编辑$HIVE_HOME/conf/hive-site.xml

    <property>
       <name>hive.metastore.uris</name>
       <value>thrift://node1:9083</value>
    </property>

node1结点启动服务

hive --service metastore

hive删除空分区

需要将分区列转为字符型才能删除

ALTER TABLE ssp_log_data PARTITION COLUMN (log_hour STRING);
ALTER TABLE ssp_log_data DROP PARTITION (log_hour='__HIVE_DEFAULT_PARTITION__');
ALTER TABLE ssp_log_data PARTITION COLUMN (log_hour INT);

Hive<找不到Spark-assemblyJar包>

编辑"$HIVE_HOME/bin/hive

sparkAssemblyPath=`ls ${SPARK_HOME}/lib/spark-assembly-*.jar`
改为
sparkAssemblyPath=`ls ${SPARK_HOME}/jars/*.jar`

hive on spark

hive链接spark相关jar

ln -sf /home/laomie/tools/spark/jars/scala-library-2.11.8.jar /home/laomie/tools/hive/lib/scala-library-2.11.8.jar
ln -sf /home/laomie/tools/spark/jars/spark-core_2.11-2.2.2.jar /home/laomie/tools/hive/lib/spark-core_2.11-2.2.2.jar
ln -sf /home/laomie/tools/spark/jars/spark-network-common_2.11-2.2.2.jar /home/laomie/tools/hive/lib/spark-network-common_2.11-2.2.2.jar

spark-defaults.conf复制到hive目录

上传spark的jar到hdfs

hdfs dfs -mkdir /spark-jars
hdfs dfs -put /home/laomie/tools/spark/jars/*.jar /spark-jars/

在hive的"hive-site.xml"增加下列内容

<property>                                                                                                                        
    <name>spark.yarn.jars</name>                                                                                                    
    <value>hdfs://localhost:9000/spark-jars/*</value>                                                                               
</property>

hive on spark

set hive.execution.engine=spark;

links

social