This dataset includes murders from the FBI’s Supplementary Homicide Report from 1976 to the present and Freedom of Information Act data on more than 22,000 homicides that were not reported to the Justice Department.This dataset includes the age, race, sex, ethnicity of victims and perpetrators,in addition to the relationship between the victim and perpetrator and weapon used.
We try to answer few questions from this dataset like,what is the trend of homicide over the years,most used weapon for killing,which age group is susceptable to killing, among other interesting things explored.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(formattable)
library(ggthemes)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
homi=read.csv("database.csv",header=TRUE,stringsAsFactors = FALSE)
glimpse(homi)
## Observations: 638,454
## Variables: 24
## $ Record.ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
## $ Agency.Code <chr> "AK00101", "AK00101", "AK00101", "AK0010...
## $ Agency.Name <chr> "Anchorage", "Anchorage", "Anchorage", "...
## $ Agency.Type <chr> "Municipal Police", "Municipal Police", ...
## $ City <chr> "Anchorage", "Anchorage", "Anchorage", "...
## $ State <chr> "Alaska", "Alaska", "Alaska", "Alaska", ...
## $ Year <int> 1980, 1980, 1980, 1980, 1980, 1980, 1980...
## $ Month <chr> "January", "March", "March", "April", "A...
## $ Incident <int> 1, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1...
## $ Crime.Type <chr> "Murder or Manslaughter", "Murder or Man...
## $ Crime.Solved <chr> "Yes", "Yes", "No", "Yes", "No", "Yes", ...
## $ Victim.Sex <chr> "Male", "Male", "Female", "Male", "Femal...
## $ Victim.Age <int> 14, 43, 30, 43, 30, 30, 42, 99, 32, 38, ...
## $ Victim.Race <chr> "Native American/Alaska Native", "White"...
## $ Victim.Ethnicity <chr> "Unknown", "Unknown", "Unknown", "Unknow...
## $ Perpetrator.Sex <chr> "Male", "Male", "Unknown", "Male", "Unkn...
## $ Perpetrator.Age <int> 15, 42, 0, 42, 0, 36, 27, 35, 0, 40, 0, ...
## $ Perpetrator.Race <chr> "Native American/Alaska Native", "White"...
## $ Perpetrator.Ethnicity <chr> "Unknown", "Unknown", "Unknown", "Unknow...
## $ Relationship <chr> "Acquaintance", "Acquaintance", "Unknown...
## $ Weapon <chr> "Blunt Object", "Strangulation", "Unknow...
## $ Victim.Count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Perpetrator.Count <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ Record.Source <chr> "FBI", "FBI", "FBI", "FBI", "FBI", "FBI"...
There are 638454 observations with 24 variables.
dim(homi)
## [1] 638454 24
table(homi$Crime.Type)
##
## Manslaughter by Negligence Murder or Manslaughter
## 9116 629338
(table(homi$Crime.Type[homi$Crime.Type=="Murder or
Manslaughter"])/length(homi$Crime.Type))
## numeric(0)
table(homi$Crime.Solved)
##
## No Yes
## 190282 448172
(table(homi$Crime.Solved[homi$Crime.Solved=="Yes"])/length(homi$Crime.Solved))
##
## Yes
## 0.7019644
temp=homi %>% select(Crime.Solved,Crime.Type) %>% filter(Crime.Solved=="Yes",Crime.Type=="Murder or Manslaughter")
table(temp)
## Crime.Type
## Crime.Solved Murder or Manslaughter
## Yes 439444
439444/629338
## [1] 0.6982639
temp=homi %>% select(Crime.Solved,Crime.Type) %>% filter(Crime.Solved=="Yes",Crime.Type=="Manslaughter by Negligence")
table(temp)
## Crime.Type
## Crime.Solved Manslaughter by Negligence
## Yes 8728
8728/9116
## [1] 0.9574375
almost 98 % of the crimes belong to murder or manslaughter type.
70 % of the total crimes registered have been solved.
69 % of the cases under Murder or Manslaugher have been solved.While 95 % of the cases under Manslaughter by negligence has been solved.
temp= homi %>% select(Agency.Type) %>% group_by(Agency.Type) %>% summarise(count=n()) %>% mutate(perc=round((count/sum(count))*100)) %>% arrange(desc(perc))
temp%>% select(Agency.Type,perc) %>% formattable(list(perc=color_bar("red")),align='l')
Agency.Type | perc |
---|---|
Municipal Police | 77 |
Sheriff | 16 |
County Police | 4 |
State Police | 2 |
Regional Police | 0 |
Special Police | 0 |
Tribal Police | 0 |
77 % of the crimes are handled by Municipal Police.
We declare a function for default theme that we will use through out this kernel.
themefu=function(){theme(text=element_text(color="red"),plot.title=element_text(size=12,color="black",hjust=0.5),plot.subtitle=element_text(face="italic"),axis.title.x=element_text(hjust=1),axis.title.y=element_text(hjust=1),legend.background = element_blank(),legend.title=element_text(color="black",face="bold"),legend.text=element_text(color="black",face="bold"))}
ggplot(homi,aes(Month,fill=Month))+geom_bar(stat="count",show.legend="na")+labs(x="Month of Crime",y="Count of Crime",caption="Source:FBI's homicide Report",title="Murder Trend by month")+facet_wrap(~Crime.Type)+scale_y_continuous(limits=c(0,60000),breaks=seq(0,60000,5000))+themefu()+theme(axis.text.x=element_text(angle=90,vjust=1))
## Warning: `show.legend` must be a logical vector of length 1.
Not a surprise since,the dataset is dominated by Murder or Manslaughter than Murder by negligence.July seems to have a spike in the overall crime rate going by the graph.Therefore lets take into consideration only Murder or Manslaughter when ploting the time series trend.
temp=homi %>% select(Year,Crime.Type) %>% group_by(Year,Crime.Type) %>% summarise(count=n()) %>% na.omit()
ggplot(temp,aes(Year,count,group=Crime.Type,color=Crime.Type))+geom_line()+geom_point(size=2,shape=1)+theme_hc(bgcolor="darkunica")+scale_fill_hc("darkunica")+labs(x="Year",y="Count",caption="Source:FBI's homicide report",title="Crime trend over the years")+geom_text(aes(label=ifelse(count>250,count,"")), size=3,hjust=1.8)+scale_x_continuous(limits=c(1980,2014),breaks=seq(1980,2014,2))
The crimes seems to have dropped in 2010 compared to 1980’s which is a good trend.
Now,let us examine the variable victim sex and preparator sex.
unique(homi$Victim.Sex)
## [1] "Male" "Female" "Unknown"
temp=homi %>% select(Victim.Sex,Perpetrator.Sex)
temp=temp %>% mutate(whokilledwho=case_when(.$Victim.Sex=="Male" & .$Perpetrator.Sex=="Male"~"Male killed by male",.$Victim.Sex=="Male" & .$Perpetrator.Sex=="Female"~"Male killed by female",.$Victim.Sex=="Male" & .$Perpetrator.Sex=="Unknown"~"Male killed by unknown",.$Victim.Sex=="Female" & .$Perpetrator.Sex=="Male"~"Female killed by male",.$Victim.Sex=="Female" & .$Perpetrator.Sex=="Female"~"Female killed by female",.$Victim.Sex=="Female" & .$Perpetrator.Sex=="Unknown"~"Female killed by unknown",.$Victim.Sex=="Unkown" & .$Perpetrator.Sex=="Male"~"Unknown killed by male",.$Victim.Sex=="Unknown" & .$Perpetrator.Sex=="Female"~"Unknown killed by female",TRUE~"Unknown killed by Unknown"))
temp=temp %>% group_by(whokilledwho) %>% summarise(number=n()) %>% arrange(desc(number)) %>% mutate(perc=round(number/sum(number)*100))
ggplot(temp,aes(whokilledwho,number,fill=whokilledwho))+geom_bar(stat="identity")+themefu()+labs(x=" ",y="Count",title="Victim Vs Preparator")+theme(axis.text.x = element_text(angle=90,vjust=0.5),legend.position="none")+geom_text(aes(label=number),vjust=0.7,color="black")
temp %>% formattable(align=c("l","l","l"),list(perc=color_tile("orange","red"),area(col="number")~normalize_bar("lightgrey")))
whokilledwho | number | perc |
---|---|---|
Male killed by male | 299879 | 47 |
Male killed by unknown | 156617 | 25 |
Female killed by male | 99381 | 16 |
Male killed by female | 37629 | 6 |
Female killed by unknown | 33095 | 5 |
Female killed by female | 10869 | 2 |
Unknown killed by Unknown | 934 | 0 |
Unknown killed by female | 50 | 0 |
About 49 % of the crimes involved males being killed by males followed 25 % by unknown perpetrator.But the gap between them is quite large -around 24 %.
Let us see,who how the perpetrator are related to the victim.For this we consider the solved crimes since they seem to dominate the data and hence the inference can be derived based on that data.
temp=homi %>% filter(Crime.Solved=="Yes",!Victim.Sex=="Unknown",!Perpetrator.Sex=="Unknown")
temp %>% ggplot(aes(Relationship,fill=Relationship))+geom_bar(stat="count")+themefu()+theme(axis.text.x = element_text(angle=90,vjust=0.5),legend.position="none")+labs(x="Relationship",y="Count",caption="Source:FBI's Homicide data",title="Relation between victim and Perpetrator",subtitle="x grid is Victim,y grid is Perpetrator")+facet_grid(Victim.Sex ~ Perpetrator.Sex,scales="free_y")
Since 49 % of the crimes involved males,there is a high dominance of Male Vs Male and Male Vs Female.
One revelation from this dataset is that males are more likely to be killed by their wife(!!) followed by their girlfriends if the perpetrator turns out to be female while if the perpetrator is male,then it would either be his acquaintaince or a stranger.
Let us examine which weapons are used for killing based on the victim and perpetrator sex.
length(unique(homi$Weapon))
## [1] 16
ggplot(temp,aes(Weapon,fill=Weapon))+geom_bar(stat="count")+themefu()+facet_grid(Victim.Sex~Perpetrator.Sex,scales="free_y")+labs(x="Weapons Used",y="Count",title="Weapon used by Perpetrator",subtitle="xgrid=Victim,ygrid=Perpetrator")+theme(axis.text.x = element_text(angle=90,vjust=0.5),legend.position="None")
Hand gun seems to be the preferred weapon to kill the Males.This is not surprising given the prevelance of widespread gun culture in US.Blunt objects,Knife,shotgun are also prefered.
min(temp$Victim.Age)
## [1] 0
max(temp$Victim.Age)
## [1] 998
There seems to be a discrepancy in the data since the min and max age is 0 and 998.
temp=homi %>% select(Victim.Age,Perpetrator.Age) %>% group_by(Victim.Age,Perpetrator.Age) %>% mutate(count=n())
head(temp,20) %>% formattable(align=c("l","l","l"),list(count=color_tile("green","red")))
Victim.Age | Perpetrator.Age | count |
---|---|---|
14 | 15 | 259 |
43 | 42 | 168 |
30 | 0 | 7186 |
43 | 42 | 168 |
30 | 0 | 7186 |
30 | 36 | 283 |
42 | 27 | 159 |
99 | 35 | 89 |
32 | 0 | 5450 |
38 | 40 | 237 |
36 | 0 | 4132 |
20 | 49 | 40 |
36 | 39 | 271 |
20 | 49 | 40 |
48 | 0 | 1859 |
31 | 29 | 461 |
16 | 19 | 551 |
33 | 23 | 286 |
27 | 33 | 331 |
33 | 35 | 386 |
A look at the first 20 rows in the table indicate that the discrepancy in the age data is prevalent in victims and perpetrators.Therefore to get a better sense of data we filter the data above 18 years of age.
temp = temp %>% filter(Victim.Age>=18 & Perpetrator.Age >=18 ) %>% group_by(Victim.Age,Perpetrator.Age) %>% summarise(n=sum(count))
temp1=temp %>% arrange(desc(n))
g1=ggplot(head(temp1,20),aes(Victim.Age,n))+geom_bar(stat="identity")+theme(legend.position="none")+labs(x="Victim Age",y="Count",title="Victim age distribution",subtitle="TOP 20 by count")+scale_x_continuous(limits=c(18,26),breaks=seq(18,26,1))+scale_fill_canva(palette="Fresh and bright")
g2=ggplot(head(temp1,20),aes(Perpetrator.Age,n,fill="darkgreen"))+geom_bar(stat="identity")+theme(legend.position="none")+labs(x="Perpetrator Age",y="Count",title="Perpetrator age distribution",subtitle="TOP 20 by count")+scale_x_continuous(limits=c(18,26),breaks=seq(18,26,1))
grid.arrange(arrangeGrob(g1,g2,ncol=2))
It is seen that the top 20 of the age data indicates that both victim and perpetrator between age group 18-26 are vulnerable to homicide death’s and for victims aged 19 years are more vulnerable.
length(unique(homi$Victim.Race))
## [1] 5
length(unique(homi$Perpetrator.Race))
## [1] 5
temp=homi %>% select(Victim.Race,Perpetrator.Race) %>% group_by(Victim.Race,Perpetrator.Race) %>% summarise(count=n())
ggplot(temp,aes(Victim.Race,count))+geom_bar(stat="identity")+theme_solarized()+facet_wrap(~Perpetrator.Race,scales="free_y")+labs(x="Victim Race",y="Count",title="Crime trend by Race",subtitle="Which race is most vulnerable?")+theme(legend.position="none",axis.text.x = element_text(angle=90,vjust=0.5))+scale_color_solarized('lightyellow')
length(unique(homi$State))
## [1] 51
temp=homi %>% group_by(State) %>% summarise(count=n()) %>% arrange(desc(count)) %>% mutate(perc=round((count/sum(count))*100,2))
ggplot(head(temp,10),aes(factor(State,levels=State),count,fill=State))+geom_bar(stat="identity")+theme_hc()+scale_fill_hc()+geom_text(aes(label=count),color="black",vjust=1)+theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none")+labs(x="State",y="Count",title=" Total Crimes in City",subtitle="Top 10")
head(temp,20) %>% select(State,perc) %>% formattable(align=c("l","l"),list(perc=color_bar("pink")))
State | perc |
---|---|
California | 15.63 |
Texas | 9.73 |
New York | 7.72 |
Florida | 5.82 |
Michigan | 4.46 |
Illinois | 4.05 |
Pennsylvania | 3.80 |
Georgia | 3.30 |
North Carolina | 3.19 |
Louisiana | 3.07 |
Ohio | 3.00 |
Maryland | 2.71 |
Virginia | 2.43 |
Tennessee | 2.34 |
Missouri | 2.32 |
New Jersey | 2.21 |
Arizona | 2.02 |
South Carolina | 1.83 |
Indiana | 1.80 |
Alabama | 1.78 |
Over the years,(from 1980 to 2014),California,Texas,New York,Florida,Michigan has topped the homicide crime.This accounts for almost 50 % of the crimes.
Thus in this project we have tried to explore the homicide dataset that has 24 variables.We have done the exploratory data analysis using univariate,bivariate datas and tried to gather interesting insights.
We have extensively made use of packages like ggplot2 and dplyr through out this project and also explored the usage of formattable,ggthemes packages.