4. Set the environment variable HIVE_HOME in the .bashrc file
Such as reading records randomly, and inserting/updating/deleting records randomly.
HIVE INSTALLATION
export HIVE_HOME=/home/matuser/work/hive-0.14.0
export PATH=$PATH:$HIVE_HOME/bin;
Bigdata@master:~$hadoop fs -mkdir /user/hive/warehouse/
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
<description>location of default database for the warehouse</description>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:derby:;databaseName=metastore_db;create=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.apache.derby.jdbc.EmbeddedDriver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
Derby metadata cannot be shared between Hive users. To share the metastore between Hive clients, configure an external database (such as MySQL):
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/metastore_db</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>matuser</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>matuser</value>
<description>password to use against metastore database</description>
</property>
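To use the MySQL metastore, the MySQL JDBC driver jar must be on Hive's classpath and the metastore schema must be initialized. A minimal sketch (the driver jar file name varies by version; paths are assumptions):
$ cp mysql-connector-java.jar $HIVE_HOME/lib/
$ $HIVE_HOME/bin/schematool -dbType mysql -initSchema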
create
alter
drop
In Hive, the default warehouse location is:
/user/hive/warehouse
Ex:
if you create a table with the name table1, then a table1 directory will be created in the following path:
/user/hive/warehouse/table1
1)TinyInt(1 byte)
2)SmallInt(2 bytes)
3)Int(4 bytes)
4)BigInt(8 bytes)
5)Float(4 bytes)
6)Double(8 bytes)
7)Decimal(hive-0.11.0)
8)Date(hive-0.12.0)
9)Timestamp(hive-0.8.0)
10)String
11)Varchar(hive-0.12.0)
12)Char(hive-0.13.0)
13)Boolean --->true/false
14)Binary --->hive-0.8.0
Hive tables:-
Hadoop
Pig
Hive
^d
hadoop
pig
hive
ex:2
hbase
oozie
flume
^d
o/p
/user/hive/warehouse/sample/file1
/user/hive/warehouse/sample/file2
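The create and load commands for the sample table are not shown above; a minimal sketch that would produce the two files listed in the output (the table name sample is taken from the output paths):
hive> create table sample(line string);
hive> load data local inpath 'file1' into table sample;
hive> load data local inpath 'file2' into table sample;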
Dropping a table
ex:
/user/hive/warehouse/extab
In hdfs
/user/hive/warehouse/extab/file1
hello
pig
hive
/user/hive/warehouse/extab/file1
hadoop
pig
hive
Both inner and external tables can use a custom HDFS location.
Ex:
In hdfs /user/matuser/file1
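A minimal sketch of pointing tables at a custom HDFS location (the path /user/matuser/mydata is an assumption based on the example above):
hive> create table innertab(line string) location '/user/matuser/mydata';
hive> create external table extab(line string) location '/user/matuser/mydata';
Dropping the external table removes only its metadata and leaves the files in place; dropping the inner (managed) table removes the data files as well.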
Ex:
$cat >sample1
1
^d
$cat >sample2
^d
$cat >sample3
^d
$hadoop fs -copyFromLocal samp3 /user/hive/warehouse/sample
o/p
2 from samp1
5 from samp2
8 from samp3
$cat >sample4
10
11
12
^d
Hive>load data local inpath 'sample4' overwrite into
table sample;
10
11
12
……Sample4
…..sample4_copy_1
10
11 from sample4
12
10
11 from sample4-copy-1
12
$cat>test1
100,200,300
400,500,600
700,800,900
^d
Queries
101,srinivas,20000,M,1001
102,satya,50000,M,1002
103,sailaja,25000,F,1002
104,amit,30000,M,1003
104,saradha,50000,F,1004
105,anusha,45000,F,1003
106,krishna,60000,M,1005
107,rajesh,42000,M,1002
108,mohit,40000,M,1001
109,bhargavi,45000,F,1004
110,amrutha,35000,F,1005
hive>create table emp(ecode int,name string,salary
int,sex string,deptno int)row format delimited fields
terminated by ',';
or
((deptno=1004
o/p
all males of 1001,1005
or deptno=1002 or deptno=1003;
or
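The where-clause fragments above are heavily elided; queries of the kind they suggest (a reconstruction, not the original text):
hive> select * from emp where sex='M' and (deptno=1001 or deptno=1005);
hive> select * from emp where deptno=1002 or deptno=1003 or deptno=1004;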
order by:
select ecode,name,salary,sex,deptno,salary*0.1
from emp;
o/p
101 satya 50000 M 1001 5000
updating a table
select ecode,name,salary+
(salary*0.1),sex,deptno,tax from emp;
if function
if(condition,value1,value2)
ex:
100,200
300,400
644,545
789,456
^d
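A minimal sketch of if() over the two-column data above (the table name pairs and columns a, b are assumptions):
hive> create table pairs(a int, b int) row format delimited fields terminated by ',';
hive> select a, b, if(a>b, a, b) as bigger from pairs;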
nested if statements;
$cat >file2
100,200,230
300,400,75
644,545,20
789,456,98
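Nested if() can pick the largest of the three columns in file2 (the table name triples and columns a, b, c are assumptions):
hive> create table triples(a int, b int, c int) row format delimited fields terminated by ',';
hive> load data local inpath 'file2' into table triples;
hive> select if(a>b, if(a>c,a,c), if(b>c,b,c)) as biggest from triples;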
xxxxx2010xxx20xx
xxxxx2010xxx21xx
xxxxx2011xxx27xx
xxxxx2011xxx23xx
^d
2010 20
2010 21
2010 20
2011 27
2011 23
In substr(line,6,4): 6 is the start position and 4 is the length of the substring.
2011 27
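A minimal sketch of the substr query behind the year/temperature output above (the table name tmpr with a single string column line is an assumption):
hive> create table tmpr(line string);
hive> load data local inpath 'tmpr.txt' into table tmpr;
hive> select substr(line,6,4), substr(line,13,2) from tmpr;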
Overwrite is mandatory
$cat >tmpr.txt
xxxxx2010xxx20xx
xxxxx2010xxx-15xx
xxxxx2010xxx-10xx
xxxxx2010xxx23xx
xxxxx2011xxx-15xx
xxxxx2011xxx-10xx
^d
if the temperature is positive, its length is 2
if negative, its length is 3 (including the minus sign)
where substr(line,13,1)!='-'
union all
o/p
2010 20
2010 -15
2010 -10
2010 23
2011 -15
2011 -10
select substr(line,6,4),
if(substr(line,13,1)!='-', substr(line,13,2), substr(line,13,3))
from tmpr;
o/p
2010 20
2010 -15
2010 -10
.........
group by y;
o/p
2012 23 -15
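A reconstruction of the group-by query these fragments point at, combining substr and if to handle negative temperatures (the subquery alias e is an assumption):
hive> select y, max(t), min(t) from (
        select substr(line,6,4) as y,
               cast(if(substr(line,13,1)!='-',
                       substr(line,13,2),
                       substr(line,13,3)) as int) as t
        from tmpr) e
      group by y;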
---------------------------------------------------------------------
Bigdata@Bigdata:~$ cat>student
Kalam,20,Btech*Mtech*PHD
praneeth,21,Bsc*Msc*Mtech
satya,38,Bsc*Msc*PHD
gagan 20 ["btech","mtech","phd"]
praveen 21 ["bsc","msc","mtech"]
Btech
Bsc
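The table definition for student is not shown; a minimal sketch consistent with the '*'-separated qualifications and the array output above:
hive> create table student(name string, age int, qual array<string>)
      row format delimited fields terminated by ','
      collection items terminated by '*';
hive> load data local inpath 'student' into table student;
hive> select qual[0] from student;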
$ cat >trans
p101,pendrive$400*camera$5000*mobile$8500
p102,Tv$30000*CDPlayer$10000*pendrive$400*computer$25000
p102
{"Tv":30000,"CDPlayer":10000,"pendrive":800,"computer":25000}
o/p
p101 400
p102 800
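A minimal sketch for the trans data, where '*' separates map entries and '$' separates keys from values:
hive> create table trans(pid string, items map<string,int>)
      row format delimited fields terminated by ','
      collection items terminated by '*'
      map keys terminated by '$';
hive> select pid, items['pendrive'] from trans;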
$ cat >profile
gagan datta,27,manaswi#23#hyd,hyd
krishna,30,saradha#25#hyd,hyd
^d
gagan datta 27 {"name":"manaswi","age":23,"city":"hyd"} hyd
krishna 30 {"name":"saradha","age":25,"city":"hyd"} hyd
hive>select name ,wife.name from profile;
krishna saradha
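A minimal sketch for the profile data, with wife as a struct whose fields are separated by '#':
hive> create table profile(name string, age int,
        wife struct<name:string,age:int,city:string>,
        city string)
      row format delimited fields terminated by ','
      collection items terminated by '#';
hive> select name, wife.name from profile;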
Like:
$cat >file3
100,,200
200,,
,,700
150,240,
,130,140
175,300,250
^d
10 20 NULL
NULL 30 40
100 50 25
NULL
NULL
NULL
NULL
NULL
NULL
175
because
100+200=300
NULL+20 =NULL
200-NULL=NULL
100 0 200
200 0 300
0 0 300
0 300 0
10 20 0
0 30 40
100 50 25
300
500
300
30
70
175
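The fragments above show that arithmetic involving NULL yields NULL; a minimal sketch of replacing NULLs with 0 before adding (the table name file3tab and columns a, b, c are assumptions):
hive> create table file3tab(a int, b int, c int)
      row format delimited fields terminated by ',';
hive> load data local inpath 'file3' into table file3tab;
hive> select a+b+c from file3tab;   -- NULL whenever any column is NULL
hive> select coalesce(a,0)+coalesce(b,0)+coalesce(c,0) from file3tab;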
ex:-
101,srinivas,20000,M,1001
102,satya,50000,M,1002
103,sailaja,25000,F,1002
104,amit,30000,M,1003
104,saradha,50000,F,1004
105,anusha,45000,F,1003
106,krishna,60000,M,1005
107,rajesh,42000,M,1002
108,mohit,40000,M,1001
109,bhargavi,45000,F,1004
110,amrutha,35000,F,1005
in the sex column: F ---> female, M ---> male
deptno: 1001 ---> marketing, 1002 ---> HR, 1003 ---> Finance, others ---> others
salary ---> grade:
salary>=70000 ---> A, salary>=50000 ---> B, salary>=30000 ---> C, salary<30000 ---> D
select ecode,name,salary,
if(sex='F','female','male'),
if(deptno=1001,'marketing',
if(deptno=1002,'HR',
if(deptno=1003,'Finance','others'))),
if(salary>=70000,'A',
if(salary>=50000,'B',
if(salary>=30000,'C','D')))
from emp;
101,srinivas,20000,Male,marketing
102,satya,50000,Male,HR
103,sailaja,25000,Female,HR
104,amit,30000,Male,Finance
104,saradha,50000,Female,others
105,anusha,45000,Female,Finance
.......
else 'others'
end
from emp;
else 'D'
end
from emp;
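The case fragments above are elided; a reconstruction of the full query they suggest, equivalent to the nested-if version:
hive> select ecode, name, salary,
        case when sex='F' then 'female' else 'male' end,
        case when deptno=1001 then 'marketing'
             when deptno=1002 then 'HR'
             when deptno=1003 then 'Finance'
             else 'others' end,
        case when salary>=70000 then 'A'
             when salary>=50000 then 'B'
             when salary>=30000 then 'C'
             else 'D' end
      from emp;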
or
ex3:
hive>select deptno,sum(salary),avg(salary),min(salary),max(salary),count(*)
from emp group by deptno;
having clause:
o/p
1002 117000
1004 95000
1005 95000
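The having query itself is elided; a minimal sketch consistent with the output above:
hive> select deptno, sum(salary) from emp
      group by deptno
      having sum(salary) >= 95000;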
Eliminating duplicates:
TAB          o/p
10           10
20           20
20           30
20
10
30
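A minimal sketch (the table name tab and column num are assumptions):
hive> select distinct num from tab;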
merging tables
UNION ALL
table -->emp1
............
...........
1000 rows
table -->emp2
................
.............
500 rows
ex: merge emp1,emp2 into emp
select * from(
union all
emp1:eid,ename,salary,sex,deptno
emp2:eid,ename,deptno,salary,sex
select * from(
union all
emp1:eid,ename,salary,sex,deptno,designation
emp2:eid,ename,salary,deptno,city.
select * from(
union all
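Reconstructions of the three union patterns sketched above (column lists are inferred from the schemas shown; the null padding in the third case is an assumption):
-- 1) identical schemas
hive> select * from (select * from emp1 union all select * from emp2) e;
-- 2) same columns in a different order: project them in one order
hive> select * from (select eid,ename,salary,sex,deptno from emp1
                     union all
                     select eid,ename,salary,sex,deptno from emp2) e;
-- 3) different columns: pad the missing ones with nulls
hive> select * from (select eid,ename,salary,deptno,designation,
                            cast(null as string) as city from emp1
                     union all
                     select eid,ename,salary,deptno,
                            cast(null as string) as designation, city from emp2) e;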
o/p
model1:
union all
group by dno;
model2:
hive>select dno,sum(tot) as tot from(
group by dno
union all
group by dno)e
group by dno;
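Reconstructions of the two aggregation models (the column names dno and salary are assumptions):
model1 -- union first, aggregate once:
hive> select dno, sum(salary) as tot from (
        select dno, salary from emp1
        union all
        select dno, salary from emp2) e
      group by dno;
model2 -- aggregate each table, union the partial totals, aggregate again:
hive> select dno, sum(tot) as tot from (
        select dno, sum(salary) as tot from emp1 group by dno
        union all
        select dno, sum(salary) as tot from emp2 group by dno) e
      group by dno;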
1)Inner joins
2)Outer Joins
Inner join
A B
1 3
2 4
3 5
4 8
5 9
o/p 3,3
4,4
5,5
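A minimal sketch, assuming single-column tables A(x) and B(y) holding the values above:
hive> select a.x, b.y from A a join B b on (a.x = b.y);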
outer joins
o/p
1 null
2 null
3,3
4,4
5,5
Right outer join:-
3,3
4,4
5,5
null,8
null,9
Full outer join:- all matching rows and non-matching rows of both the left and right tables
3,3
4,4
5,5
1,null
2,null
null,8
null,9
Emp
eid ename salary sex deptno
Dept
deptno dname city
1003 HR Chennai
InnerJoin
o/p
delhi 100000
hyderabad 125000
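A sketch of the join-plus-aggregation that would yield the city-wise salary totals above (the Dept column names are assumptions):
hive> select d.city, sum(e.salary)
      from emp e join dept d on (e.deptno = d.deptno)
      group by d.city;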
denormalization:
now, without applying joins every time, you can directly query empdept.
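A minimal sketch of building the denormalized empdept table once with CTAS:
hive> create table empdept as
      select e.eid, e.ename, e.salary, e.sex, e.deptno, d.dname, d.city
      from emp e join dept d on (e.deptno = d.deptno);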
sales datasets
$cat sales
13-10-2010 261.54
01-10-2012 10123.02
01-10-2012 244.57
10-07-2011 4965.7595
28-08-2010 394.27
28-08-2010 146.69
17-06-2011 93.54
17-06-2011 905.08
24-03-2011 2781.82
26-02-2010 228.41
23-11-2010 196.85
23-11-2010 124.56
08-06-2012 716.84
08-06-2012 1474.33
04-08-2012 80.61
01-01-2001 3000.89
............
2001-01-01 3000.78
.................
.................
2014-12-31 200.09
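The conversion from dd-MM-yyyy strings to Hive's yyyy-MM-dd format is not shown; a minimal sketch using string functions (a staging table raw_sales with columns dt string, amt double is an assumption):
hive> create table sales as
      select concat(substr(dt,7,4),'-',substr(dt,4,2),'-',substr(dt,1,2)) as dt, amt
      from raw_sales;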
Once the date is in Hive's date format (yyyy-MM-dd), you can apply all Hive date functions:
month('2001-01-05') o/p 1
day('2001-01-05') o/p 5
group by year(dt);
group by day(dt);
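Completed forms of the grouping fragments above (the sum(amt) aggregate is an assumption):
hive> select year(dt), sum(amt) from sales group by year(dt);
hive> select day(dt), sum(amt) from sales group by day(dt);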
select dt,amt,case
else 4
group by year(dt),halfyear;
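A reconstruction of the quarter-style query the case fragment suggests (the quarter boundaries are an assumption; Hive requires repeating the case expression in the group by):
hive> select year(dt),
        case when month(dt)<=3 then 1
             when month(dt)<=6 then 2
             when month(dt)<=9 then 3
             else 4 end as quarter,
        sum(amt) as totamt
      from sales
      group by year(dt),
        case when month(dt)<=3 then 1
             when month(dt)<=6 then 2
             when month(dt)<=9 then 3
             else 4 end;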
quarter totamt
1 50000
2 75000
3 60000
4 90000
select l.quarter,l.totamt,r.quarter,r.totamt
from quarterly_report2010 l
join quarterly_report2010 r;
q1 tot1 q2 tot2
1 50000 1 50000
1 50000 2 75000
1 50000 3 60000
1 50000 4 90000
........
where q1-q2=1;
q1 tot1 q2 tot2
2 75000 1 50000
3 60000 2 75000
4 90000 3 60000
select q1 ,tot1,q2,tot2
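Putting the self-join together, a sketch of the quarter-over-quarter comparison (alias names follow the output headers; cross join is used because the filter is not an equi-join condition):
hive> select q1, tot1, q2, tot2
      from (select l.quarter as q1, l.totamt as tot1,
                   r.quarter as q2, r.totamt as tot2
            from quarterly_report2010 l
            cross join quarterly_report2010 r) e
      where q1 - q2 = 1;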
Classification of tables
1) partitioned tables
Hive was introduced to reduce the burden of data querying. Apache Hive converts SQL queries into MapReduce jobs and submits them to the Hadoop cluster. When we submit a SQL query, Hive reads the entire data set, so running MapReduce jobs over a large table becomes inefficient. This is resolved by creating partitions in tables. Apache Hive makes implementing partitions easy through its automatic partition scheme at table-creation time.
In the partitioning method, all the table data is divided into multiple partitions, each corresponding to a specific value (or values) of the partition column(s). A partition is kept as a sub-directory inside the table's directory in HDFS. Therefore, when a particular table is queried, only the partition containing the queried value is read. This decreases the I/O time required by the query and hence increases performance.
Suppose we need to retrieve the details of all the clients who joined in 2012. The query would search the whole table for the required information. But if we partition the client data by year and store it in separate files, the query processing time is reduced.
The following example will help us learn how a partitioned table lays out its data:
tab1/clientdata/2009/file2
tab1/clientdata/2010/file3
Now when we are retrieving the data from the table, only the data of the specified
partition will be queried. Creating a partitioned table is as follows:
CREATE TABLE table_tab1 (id INT, name STRING, dept STRING, yoj INT) PARTITIONED BY (year STRING)
row format delimited fields terminated by ',';
load data local inpath 'file:///home/madhu/2009_f1' OVERWRITE into table table_tab1 PARTITION(year='2009');
load data local inpath 'file:///home/madhu/2010_f2' OVERWRITE into table table_tab1 PARTITION(year='2010');
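With the partitions in place, a query that filters on the partition column reads only that partition's directory:
hive> select * from table_tab1 where year = '2009';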
● Static Partitioning
● Dynamic Partitioning
● Usually when loading files (big files) into Hive tables static partitions
are preferred.
● You “statically” add a partition in the table and move the file into the
partition of the table.
● You can get the partition column value from the filename, day of date
etc without reading the whole big file.
● If you want to use static partitions in Hive, you should set the property hive.mapred.mode=strict (this property can be set in hive-site.xml).
● You should use a where clause (a partition filter) when querying a static partition in strict mode; see the sketch after this list.
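A minimal static-partition sketch (the partition value and file name are assumptions):
hive> alter table table_tab1 add partition (year='2011');
hive> load data local inpath '/home/madhu/2011_f3' into table table_tab1 partition (year='2011');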
● When you have large data stored in a table then the Dynamic partition
is suitable.
● If you want to partition by a column but do not know in advance how many partition values it has, dynamic partitioning is also suitable.
● You can perform dynamic partition on hive external table and managed
table.
● To use dynamic partitioning in Hive, the mode must be non-strict (hive.exec.dynamic.partition.mode=nonstrict), as sketched after this list.
● There is the possibility of too many small partition creations- too many
directories.
● Partitioning is effective for low-volume data, but some queries, such as a group by on a high volume of data, still take a long time to execute. For example, grouping the population of China will take much longer than grouping the population of Vatican City.
● There is no need to search the entire table for a single record.
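A minimal dynamic-partition sketch (the staging table staging_tab1 is an assumption; the partition column must come last in the select list):
hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> insert overwrite table table_tab1 partition(year)
      select id, name, dept, yoj, year from staging_tab1;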
Bucketing – In Hive, tables or partitions are subdivided into buckets based on the hash of a column in the table, to give extra structure to the data that may be used for more efficient queries.
Each bucket in Hive is just a file in the table directory (for an unpartitioned table) or in the partition directory. If you choose to divide the partitions into n buckets, you will have n files in each of your partition directories. For example, if each partition is bucketed into 2 buckets, then each partition, say EEE, will have two files, each storing part of the EEE students' data.
Case Study on Partitioning and Bucketing
A client has e-commerce data for its India operations, with each state's (38 states) operations recorded as a whole. If we take the state column as the partition key and partition that India data, we get 38 partitions, equal to the number of states, so that each state's data can be viewed separately in the partitioned table.
Creation of the table allstates with 3 columns: state, district, and enrollment.
set hive.exec.dynamic.partition.mode=nonstrict
Actual processing and formation of the partition tables uses state as the partition key; there will be 38 partition outputs in HDFS storage, each directory named after a state. A code sketch of these steps follows below.
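A sketch of the case-study flow (table and column names follow the description above; the staging/partitioned table split is an assumption):
hive> create table allstates(state string, district string, enrollment int)
      row format delimited fields terminated by ',';
hive> create table state_part(district string, enrollment int)
      partitioned by (state string);
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> insert overwrite table state_part partition(state)
      select district, enrollment, state from allstates;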
● The data i.e. present in that partitions can be divided further into
Buckets
● The division is performed based on Hash of particular columns that
we selected in the table.
● Buckets use some form of Hashing algorithm at back end to read
each record and place it into buckets
● In Hive, we have to enable buckets by using:
set hive.enforce.bucketing=true;
Before we start moving employees data into buckets, make sure that it consists of columns such as first_name, job_id, department, salary and country.
Here we are loading data into sample bucket from employees table.
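A sketch of the bucket load described above (the bucket count of 4, the table name sample_bucket, and the column types are assumptions; the column names are from the text):
hive> set hive.enforce.bucketing=true;
hive> create table sample_bucket(first_name string, job_id int,
        department string, salary int, country string)
      clustered by (country) into 4 buckets;
hive> insert overwrite table sample_bucket
      select first_name, job_id, department, salary, country from employees;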
non partitioned:
hive>use empdetails;
now hive has to read all 500 rows; after reading each row it validates the condition (sex='F'), and if true, it writes the row.
hive>describe emp;
eid int
name string
deptno int
/user/hive/warehouse/empdetails.db/emp
sol:
partition(gender='F')//partition name
partition(gender='M')//partition name
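A sketch of the solution: create a table partitioned by gender and fill each partition from emp (the table name emp_part is an assumption, and emp is assumed to carry a sex column):
hive> create table emp_part(eid int, name string, deptno int)
      partitioned by (gender string);
hive> insert overwrite table emp_part partition(gender='F')
      select eid, name, deptno from emp where sex='F';
hive> insert overwrite table emp_part partition(gender='M')
      select eid, name, deptno from emp where sex='M';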
note:
Multiple Insertion
hive>from default.emp
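The multi-insert fragment completed (a sketch; it scans emp once and writes both partitions):
hive> from default.emp
      insert overwrite table emp_part partition(gender='F')
        select eid, name, deptno where sex='F'
      insert overwrite table emp_part partition(gender='M')
        select eid, name, deptno where sex='M';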
In HDFS,
/user/hive/warehouse/empdetails.db
 emp/
  gender=F/000000_0
  gender=M/000000_0
Observe the layout above:
emp -> is directory for table
gender=F -> is subdirectory of emp
gender=F/000000_0 file contains only female data
gender=M -> is subdirectory of emp
gender=M/000000_0 file contains only male data
Reading data from partitions
dno int
sex string -----> logical columns for partitions
/dno=1001
 -> sex=F/000000_0
 -> sex=M/000000_0
/dno=1002
 -> sex=F/000000_0
 -> sex=M/000000_0
Now you created the emp1 table, partitioned by (dno int, sex string).
Here, dno is the primary partition and sex is the sub-partition.
The primary partition is a subdirectory of the table's directory, and a sub-partition is a subdirectory of the primary partition.
In this way you can go for multi-level sub-partitioning.
/user/hive/warehouse/mydb.db/emp1/dno=1001/sex=F/000000_0
/user/hive/warehouse/mydb.db/emp1/dno=1001/sex=M/000000_0
/user/hive/warehouse/mydb.db/emp1/dno=1002/sex=F/000000_0
/user/hive/warehouse/mydb.db/emp1/dno=1002/sex=M/000000_0
In HDFS,
/user/hive/warehouse/mydb.db/newsales
 /y=2001
  -> /m=1
     d=1/000000_0
     d=2/000000_0
     ............
     d=31/000000_0
  ............
  -> /m=12
     d=1/000000_0
     d=2/000000_0
     ............
     d=31/000000_0
 ............
 /y=2015
  -> /m=1
     d=1/000000_0
     d=2/000000_0
     ............
     d=31/000000_0
  ............
  -> /m=7
     d=1/000000_0
     d=2/000000_0
     ............
     d=19/000000_0
In HDFS,
/user
/hive
/warehouse
/matdb.db
/enquiries
/y=2015
/m=7
/d=1/enquiries1
/d=2/enquiries2
/d=3/enquiries3
...............
...............
I want to list out all enquiries from 2015-10-1 to 2015-10-5.
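A sketch of that query, using the partition columns from the layout above:
hive> select * from enquiries
      where y=2015 and m=10 and d>=1 and d<=5;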
// hive reads
/user/hive/warehouse/......./emp1/d=1001/grade=F
/user/hive/warehouse/......./emp1/d=1001/grade=M
partitions.
if you set,
hive> set hive.mapred.mode=nonstrict;
// Now you can request all partitions of the table
now hive has to create 10000 partitions: if you have 100 unique cities and 100 unique department numbers (dnos), one partition data file is created for each combination, so 10000 files. The NameNode has to maintain metadata for all 10000 files, which is a great overhead (burden) for the NameNode. What if you partition your sales table by product_id and you have millions of products? That puts a very heavy burden on the NameNode for maintaining metadata. In such cases, we choose bucketed tables.
Hive UDFS:
package com.mat.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
// A simple Hive UDF that converts its string argument to upper case.
public final class UpperUDF extends UDF
{
// Hive calls evaluate() once per input row.
public Text evaluate(final Text s)
{
if(s==null)
{
return null; // a NULL input stays NULL in the output
}
return new Text(s.toString().toUpperCase());
}
}
5)
6)
$ export HIVE_AUX_JARS_PATH=/home/matuser
Once you have Hive running, the last step is to register your function:
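A sketch of registering and using the UDF from the Hive shell (the jar and function names are assumptions):
hive> add jar /home/matuser/upperudf.jar;
hive> create temporary function myupper as 'com.mat.hive.udf.UpperUDF';
hive> select myupper(name) from emp;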
views:
FROM(
SELECT * FROM empmaster e1 JOIN dept d
ON(e1.deptid=d.deptid) WHERE d.deptid=1001
)e SELECT e.* WHERE ename LIKE 'g%';
the view is used like any other table. In this query we added a
WHERE clause to the
SELECT statement. This exactly emulates the original query:
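The CREATE VIEW statement itself is not shown; a sketch of the view this section implies, followed by the query that uses it:
hive> create view emp_dept1001 as
      select e1.* from empmaster e1 join dept d
      on (e1.deptid = d.deptid)
      where d.deptid = 1001;
hive> select * from emp_dept1001 where ename like 'g%';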
Bucketing tables
/user/hive/warehouse/mudb.db/emp_buckets/000001_0
/user/hive/warehouse/mudb.db/emp_buckets/000002_0
Hive maintains a hashtable recording which key is stored in which bucket. When you request data using the bucketing column, Hive first determines the address of the (bucketed) key by reading the hashtable, and reads data only from that bucket, without reading from the other buckets.
Hashtable
Hashcode of key    Bucket
11                 000000_0
12                 000001_0
13                 000002_0
14                 000000_0
15                 000000_0
16                 000002_0
17                 000003_0
18                 000003_0
...
Case study on how to use Hive on top of Hadoop for different statistical analyses. In this example we have a predefined dataset (cricket.csv) with more than 20 columns and more than 10000 records in it. The example below explains how to display a few records based on attributes of cricket. The cricket.csv file has different attributes like:
name,country,Player_ID,Year,stint,teamid,run,igl,D,G,G_batting,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,G_OLD
Follow the steps below to retrieve data from big tables.
5. Load the data from the csv file into the table temp_cricket:
hive> LOAD DATA INPATH '/user/sai/sampledataset/hivedataset/cricket.csv' OVERWRITE INTO TABLE temp_cricket;
6. Create another table to extract the data:
hive> create table batting (player_id STRING, year INT, runs INT);
Example 1: Display the year and the maximum runs scored in each year:
hive> SELECT year, max(runs) FROM batting GROUP BY year;
Example 2: Display the player_id, year and maximum runs scored in each year:
hive> SELECT a.year, a.player_id, a.runs FROM batting a
      JOIN (SELECT year, max(runs) runs FROM batting GROUP BY year) b
      ON (a.year = b.year AND a.runs = b.runs);
Total jobs = 2
Launching job 1 out of 2
Hadoop job information for Stage-2: number of mappers: 1; number of reducers: 1
Stage-2 map = 0%, reduce = 0%
Stage-2 map = 100%, reduce = 0%, cumulative CPU 3.0 sec
Stage-2 map = 100%, reduce = 100%, cumulative CPU 5.95 sec
MapReduce total cumulative CPU time: 5.95 sec
Launching job 2 out of 2
Hadoop job information for Stage-3: number of mappers: 1; number of reducers: 0
Stage-3 map = 0%, reduce = 0%
Stage-3 map = 100%, reduce = 0%, cumulative CPU 3.12 sec
Stage-3 map = 100%, reduce = 100%, cumulative CPU 3.12 sec
MapReduce total cumulative CPU time: 3.12 sec
Time taken: 65.743 sec
As there are two queries (SELECT statements), two jobs were executed. The CPU time varies according to the system configuration.
Conclusion: Hive works with Hadoop to allow you to query and manage large-scale data using a familiar SQL-like interface. Hive provides a command-line interface to the shell.