Introduction

Loading libraries and data

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(formattable)
library(ggthemes)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
homi=read.csv("database.csv",header=TRUE,stringsAsFactors = FALSE)
glimpse(homi)
## Observations: 638,454
## Variables: 24
## $ Record.ID             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
## $ Agency.Code           <chr> "AK00101", "AK00101", "AK00101", "AK0010...
## $ Agency.Name           <chr> "Anchorage", "Anchorage", "Anchorage", "...
## $ Agency.Type           <chr> "Municipal Police", "Municipal Police", ...
## $ City                  <chr> "Anchorage", "Anchorage", "Anchorage", "...
## $ State                 <chr> "Alaska", "Alaska", "Alaska", "Alaska", ...
## $ Year                  <int> 1980, 1980, 1980, 1980, 1980, 1980, 1980...
## $ Month                 <chr> "January", "March", "March", "April", "A...
## $ Incident              <int> 1, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1...
## $ Crime.Type            <chr> "Murder or Manslaughter", "Murder or Man...
## $ Crime.Solved          <chr> "Yes", "Yes", "No", "Yes", "No", "Yes", ...
## $ Victim.Sex            <chr> "Male", "Male", "Female", "Male", "Femal...
## $ Victim.Age            <int> 14, 43, 30, 43, 30, 30, 42, 99, 32, 38, ...
## $ Victim.Race           <chr> "Native American/Alaska Native", "White"...
## $ Victim.Ethnicity      <chr> "Unknown", "Unknown", "Unknown", "Unknow...
## $ Perpetrator.Sex       <chr> "Male", "Male", "Unknown", "Male", "Unkn...
## $ Perpetrator.Age       <int> 15, 42, 0, 42, 0, 36, 27, 35, 0, 40, 0, ...
## $ Perpetrator.Race      <chr> "Native American/Alaska Native", "White"...
## $ Perpetrator.Ethnicity <chr> "Unknown", "Unknown", "Unknown", "Unknow...
## $ Relationship          <chr> "Acquaintance", "Acquaintance", "Unknown...
## $ Weapon                <chr> "Blunt Object", "Strangulation", "Unknow...
## $ Victim.Count          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Perpetrator.Count     <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ Record.Source         <chr> "FBI", "FBI", "FBI", "FBI", "FBI", "FBI"...

There are 638454 observations with 24 variables.

Crime rate by %

dim(homi)
## [1] 638454     24
table(homi$Crime.Type)
## 
## Manslaughter by Negligence     Murder or Manslaughter 
##                       9116                     629338
(table(homi$Crime.Type[homi$Crime.Type=="Murder or
                       Manslaughter"])/length(homi$Crime.Type))
## numeric(0)
table(homi$Crime.Solved)
## 
##     No    Yes 
## 190282 448172
(table(homi$Crime.Solved[homi$Crime.Solved=="Yes"])/length(homi$Crime.Solved))
## 
##       Yes 
## 0.7019644
temp=homi %>% select(Crime.Solved,Crime.Type) %>% filter(Crime.Solved=="Yes",Crime.Type=="Murder or Manslaughter")
table(temp)
##             Crime.Type
## Crime.Solved Murder or Manslaughter
##          Yes                 439444
439444/629338
## [1] 0.6982639
temp=homi %>% select(Crime.Solved,Crime.Type) %>% filter(Crime.Solved=="Yes",Crime.Type=="Manslaughter by Negligence")
table(temp)
##             Crime.Type
## Crime.Solved Manslaughter by Negligence
##          Yes                       8728
8728/9116
## [1] 0.9574375

almost 98 % of the crimes belong to murder or manslaughter type.

70 % of the total crimes registered have been solved.

69 % of the cases under Murder or Manslaugher have been solved.While 95 % of the cases under Manslaughter by negligence has been solved.

Agency Type

temp= homi %>% select(Agency.Type) %>% group_by(Agency.Type) %>% summarise(count=n()) %>% mutate(perc=round((count/sum(count))*100)) %>% arrange(desc(perc))
temp%>% select(Agency.Type,perc) %>% formattable(list(perc=color_bar("red")),align='l')
Agency.Type perc
Municipal Police 77
Sheriff 16
County Police 4
State Police 2
Regional Police 0
Special Police 0
Tribal Police 0

77 % of the crimes are handled by Municipal Police.

We declare a function for default theme that we will use through out this kernel.

themefu=function(){theme(text=element_text(color="red"),plot.title=element_text(size=12,color="black",hjust=0.5),plot.subtitle=element_text(face="italic"),axis.title.x=element_text(hjust=1),axis.title.y=element_text(hjust=1),legend.background = element_blank(),legend.title=element_text(color="black",face="bold"),legend.text=element_text(color="black",face="bold"))}

Analysing monthly crime types:

ggplot(homi,aes(Month,fill=Month))+geom_bar(stat="count",show.legend="na")+labs(x="Month of Crime",y="Count of Crime",caption="Source:FBI's homicide Report",title="Murder Trend by month")+facet_wrap(~Crime.Type)+scale_y_continuous(limits=c(0,60000),breaks=seq(0,60000,5000))+themefu()+theme(axis.text.x=element_text(angle=90,vjust=1))
## Warning: `show.legend` must be a logical vector of length 1.

Not a surprise since,the dataset is dominated by Murder or Manslaughter than Murder by negligence.July seems to have a spike in the overall crime rate going by the graph.Therefore lets take into consideration only Murder or Manslaughter when ploting the time series trend.

Time Series data

temp=homi %>% select(Year,Crime.Type) %>% group_by(Year,Crime.Type) %>% summarise(count=n()) %>% na.omit()
ggplot(temp,aes(Year,count,group=Crime.Type,color=Crime.Type))+geom_line()+geom_point(size=2,shape=1)+theme_hc(bgcolor="darkunica")+scale_fill_hc("darkunica")+labs(x="Year",y="Count",caption="Source:FBI's homicide report",title="Crime trend over the years")+geom_text(aes(label=ifelse(count>250,count,"")), size=3,hjust=1.8)+scale_x_continuous(limits=c(1980,2014),breaks=seq(1980,2014,2))

The crimes seems to have dropped in 2010 compared to 1980’s which is a good trend.

Now,let us examine the variable victim sex and preparator sex.

Sex

unique(homi$Victim.Sex)
## [1] "Male"    "Female"  "Unknown"
temp=homi %>% select(Victim.Sex,Perpetrator.Sex) 
temp=temp %>% mutate(whokilledwho=case_when(.$Victim.Sex=="Male" & .$Perpetrator.Sex=="Male"~"Male killed by male",.$Victim.Sex=="Male" & .$Perpetrator.Sex=="Female"~"Male killed by female",.$Victim.Sex=="Male" & .$Perpetrator.Sex=="Unknown"~"Male killed by unknown",.$Victim.Sex=="Female" & .$Perpetrator.Sex=="Male"~"Female killed by male",.$Victim.Sex=="Female" & .$Perpetrator.Sex=="Female"~"Female killed by female",.$Victim.Sex=="Female" & .$Perpetrator.Sex=="Unknown"~"Female killed by unknown",.$Victim.Sex=="Unkown" & .$Perpetrator.Sex=="Male"~"Unknown killed by male",.$Victim.Sex=="Unknown" & .$Perpetrator.Sex=="Female"~"Unknown killed by female",TRUE~"Unknown killed by Unknown"))

temp=temp %>% group_by(whokilledwho) %>% summarise(number=n()) %>% arrange(desc(number)) %>% mutate(perc=round(number/sum(number)*100))
ggplot(temp,aes(whokilledwho,number,fill=whokilledwho))+geom_bar(stat="identity")+themefu()+labs(x=" ",y="Count",title="Victim Vs Preparator")+theme(axis.text.x = element_text(angle=90,vjust=0.5),legend.position="none")+geom_text(aes(label=number),vjust=0.7,color="black")

temp %>% formattable(align=c("l","l","l"),list(perc=color_tile("orange","red"),area(col="number")~normalize_bar("lightgrey")))
whokilledwho number perc
Male killed by male 299879 47
Male killed by unknown 156617 25
Female killed by male 99381 16
Male killed by female 37629 6
Female killed by unknown 33095 5
Female killed by female 10869 2
Unknown killed by Unknown 934 0
Unknown killed by female 50 0

About 49 % of the crimes involved males being killed by males followed 25 % by unknown perpetrator.But the gap between them is quite large -around 24 %.

Let us see,who how the perpetrator are related to the victim.For this we consider the solved crimes since they seem to dominate the data and hence the inference can be derived based on that data.

Relationship type

temp=homi %>% filter(Crime.Solved=="Yes",!Victim.Sex=="Unknown",!Perpetrator.Sex=="Unknown")
temp %>% ggplot(aes(Relationship,fill=Relationship))+geom_bar(stat="count")+themefu()+theme(axis.text.x = element_text(angle=90,vjust=0.5),legend.position="none")+labs(x="Relationship",y="Count",caption="Source:FBI's Homicide data",title="Relation between victim and Perpetrator",subtitle="x grid is Victim,y grid is Perpetrator")+facet_grid(Victim.Sex ~ Perpetrator.Sex,scales="free_y")

Since 49 % of the crimes involved males,there is a high dominance of Male Vs Male and Male Vs Female.

One revelation from this dataset is that males are more likely to be killed by their wife(!!) followed by their girlfriends if the perpetrator turns out to be female while if the perpetrator is male,then it would either be his acquaintaince or a stranger.

Let us examine which weapons are used for killing based on the victim and perpetrator sex.

Weapons used

length(unique(homi$Weapon))
## [1] 16
ggplot(temp,aes(Weapon,fill=Weapon))+geom_bar(stat="count")+themefu()+facet_grid(Victim.Sex~Perpetrator.Sex,scales="free_y")+labs(x="Weapons Used",y="Count",title="Weapon used by Perpetrator",subtitle="xgrid=Victim,ygrid=Perpetrator")+theme(axis.text.x = element_text(angle=90,vjust=0.5),legend.position="None")

Hand gun seems to be the preferred weapon to kill the Males.This is not surprising given the prevelance of widespread gun culture in US.Blunt objects,Knife,shotgun are also prefered.

Age of the victim and perpetrator

min(temp$Victim.Age)
## [1] 0
max(temp$Victim.Age)
## [1] 998

There seems to be a discrepancy in the data since the min and max age is 0 and 998.

temp=homi %>% select(Victim.Age,Perpetrator.Age) %>% group_by(Victim.Age,Perpetrator.Age) %>% mutate(count=n())
head(temp,20) %>% formattable(align=c("l","l","l"),list(count=color_tile("green","red")))
Victim.Age Perpetrator.Age count
14 15 259
43 42 168
30 0 7186
43 42 168
30 0 7186
30 36 283
42 27 159
99 35 89
32 0 5450
38 40 237
36 0 4132
20 49 40
36 39 271
20 49 40
48 0 1859
31 29 461
16 19 551
33 23 286
27 33 331
33 35 386

A look at the first 20 rows in the table indicate that the discrepancy in the age data is prevalent in victims and perpetrators.Therefore to get a better sense of data we filter the data above 18 years of age.

temp = temp %>% filter(Victim.Age>=18 & Perpetrator.Age >=18 ) %>% group_by(Victim.Age,Perpetrator.Age) %>% summarise(n=sum(count))
temp1=temp %>% arrange(desc(n))
g1=ggplot(head(temp1,20),aes(Victim.Age,n))+geom_bar(stat="identity")+theme(legend.position="none")+labs(x="Victim Age",y="Count",title="Victim age distribution",subtitle="TOP 20 by count")+scale_x_continuous(limits=c(18,26),breaks=seq(18,26,1))+scale_fill_canva(palette="Fresh and bright")
g2=ggplot(head(temp1,20),aes(Perpetrator.Age,n,fill="darkgreen"))+geom_bar(stat="identity")+theme(legend.position="none")+labs(x="Perpetrator Age",y="Count",title="Perpetrator age distribution",subtitle="TOP 20 by count")+scale_x_continuous(limits=c(18,26),breaks=seq(18,26,1))
grid.arrange(arrangeGrob(g1,g2,ncol=2))

It is seen that the top 20 of the age data indicates that both victim and perpetrator between age group 18-26 are vulnerable to homicide death’s and for victims aged 19 years are more vulnerable.

Race - Which race is more vulnerable

length(unique(homi$Victim.Race))
## [1] 5
length(unique(homi$Perpetrator.Race))
## [1] 5
temp=homi %>% select(Victim.Race,Perpetrator.Race) %>% group_by(Victim.Race,Perpetrator.Race) %>% summarise(count=n())

ggplot(temp,aes(Victim.Race,count))+geom_bar(stat="identity")+theme_solarized()+facet_wrap(~Perpetrator.Race,scales="free_y")+labs(x="Victim Race",y="Count",title="Crime trend by Race",subtitle="Which race is most vulnerable?")+theme(legend.position="none",axis.text.x = element_text(angle=90,vjust=0.5))+scale_color_solarized('lightyellow')

State and Crime Rate

length(unique(homi$State))
## [1] 51
temp=homi %>% group_by(State) %>% summarise(count=n()) %>% arrange(desc(count)) %>% mutate(perc=round((count/sum(count))*100,2))
ggplot(head(temp,10),aes(factor(State,levels=State),count,fill=State))+geom_bar(stat="identity")+theme_hc()+scale_fill_hc()+geom_text(aes(label=count),color="black",vjust=1)+theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none")+labs(x="State",y="Count",title=" Total Crimes in City",subtitle="Top 10")

head(temp,20) %>% select(State,perc) %>% formattable(align=c("l","l"),list(perc=color_bar("pink")))
State perc
California 15.63
Texas 9.73
New York 7.72
Florida 5.82
Michigan 4.46
Illinois 4.05
Pennsylvania 3.80
Georgia 3.30
North Carolina 3.19
Louisiana 3.07
Ohio 3.00
Maryland 2.71
Virginia 2.43
Tennessee 2.34
Missouri 2.32
New Jersey 2.21
Arizona 2.02
South Carolina 1.83
Indiana 1.80
Alabama 1.78

Over the years,(from 1980 to 2014),California,Texas,New York,Florida,Michigan has topped the homicide crime.This accounts for almost 50 % of the crimes.

Conclusion